xbrz.cpp 52 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384
  1. // ****************************************************************************
  2. // * This file is part of the xBRZ project. It is distributed under *
  3. // * GNU General Public License: https://www.gnu.org/licenses/gpl-3.0 *
  4. // * Copyright (C) Zenju (zenju AT gmx DOT de) - All Rights Reserved *
  5. // * *
  6. // * Additionally and as a special exception, the author gives permission *
  7. // * to link the code of this program with the following libraries *
  8. // * (or with modified versions that use the same licenses), and distribute *
  9. // * linked combinations including the two: MAME, FreeFileSync, Snes9x, ePSXe *
  10. // * You must obey the GNU General Public License in all respects for all of *
  11. // * the code used other than MAME, FreeFileSync, Snes9x, ePSXe. *
  12. // * If you modify this file, you may extend this exception to your version *
  13. // * of the file, but you are not obligated to do so. If you do not wish to *
  14. // * do so, delete this exception statement from your version. *
  15. // ****************************************************************************
  16. #include "xbrz.h"
  17. #include <cassert>
  18. #include <vector>
  19. #include <algorithm>
  20. #include <cmath> //std::sqrt
  21. #include "xbrz_tools.h"
  22. #if defined _MSC_VER
  23. #pragma warning(disable:5051)
  24. #endif
  25. using namespace xbrz;
  26. namespace
  27. {
  28. template <unsigned int M, unsigned int N> inline
  29. uint32_t gradientRGB(uint32_t pixFront, uint32_t pixBack) //blend front color with opacity M / N over opaque background: https://en.wikipedia.org/wiki/Alpha_compositing#Alpha_blending
  30. {
  31. static_assert(0 < M && M < N && N <= 1000);
  32. auto calcColor = [](unsigned char colFront, unsigned char colBack) -> unsigned char { return (colFront * M + colBack * (N - M)) / N; };
  33. return makePixel(calcColor(getRed (pixFront), getRed (pixBack)),
  34. calcColor(getGreen(pixFront), getGreen(pixBack)),
  35. calcColor(getBlue (pixFront), getBlue (pixBack)));
  36. }
  37. template <unsigned int M, unsigned int N> inline
  38. uint32_t gradientARGB(uint32_t pixFront, uint32_t pixBack) //find intermediate color between two colors with alpha channels (=> NO alpha blending!!!)
  39. {
  40. static_assert(0 < M && M < N && N <= 1000);
  41. const unsigned int weightFront = getAlpha(pixFront) * M;
  42. const unsigned int weightBack = getAlpha(pixBack) * (N - M);
  43. const unsigned int weightSum = weightFront + weightBack;
  44. if (weightSum == 0)
  45. return 0;
  46. auto calcColor = [=](unsigned char colFront, unsigned char colBack)
  47. {
  48. return static_cast<unsigned char>((colFront * weightFront + colBack * weightBack) / weightSum);
  49. };
  50. return makePixel(static_cast<unsigned char>(weightSum / N),
  51. calcColor(getRed (pixFront), getRed (pixBack)),
  52. calcColor(getGreen(pixFront), getGreen(pixBack)),
  53. calcColor(getBlue (pixFront), getBlue (pixBack)));
  54. }
  55. //inline
  56. //double fastSqrt(double n)
  57. //{
  58. // __asm //speeds up xBRZ by about 9% compared to std::sqrt which internally uses the same assembler instructions but adds some "fluff"
  59. // {
  60. // fld n
  61. // fsqrt
  62. // }
  63. //}
  64. //
  //FORCE_INLINE: compiler-specific "always inline" request; falls back to plain "inline" on unknown compilers
  #if defined(_MSC_VER)
      #define FORCE_INLINE __forceinline
  #elif defined(__GNUC__)
      #define FORCE_INLINE __attribute__((always_inline)) inline
  #else
      #define FORCE_INLINE inline
  #endif
  //rotation in steps of 90 degrees, clock-wise; the numeric value equals the number of steps
  enum RotationDegree
  {
      ROT_0   = 0,
      ROT_90  = 1,
      ROT_180 = 2,
      ROT_270 = 3
  };
  79. //calculate input matrix coordinates after rotation at compile time
  80. template <RotationDegree rotDeg, size_t I, size_t J, size_t N>
  81. struct MatrixRotation;
  82. template <size_t I, size_t J, size_t N>
  83. struct MatrixRotation<ROT_0, I, J, N>
  84. {
  85. static const size_t I_old = I;
  86. static const size_t J_old = J;
  87. };
  88. template <RotationDegree rotDeg, size_t I, size_t J, size_t N> //(i, j) = (row, col) indices, N = size of (square) matrix
  89. struct MatrixRotation
  90. {
  91. static const size_t I_old = N - 1 - MatrixRotation<static_cast<RotationDegree>(rotDeg - 1), I, J, N>::J_old; //old coordinates before rotation!
  92. static const size_t J_old = MatrixRotation<static_cast<RotationDegree>(rotDeg - 1), I, J, N>::I_old; //
  93. };
  94. template <size_t N, RotationDegree rotDeg>
  95. class OutputMatrix
  96. {
  97. public:
  98. OutputMatrix(uint32_t* out, int outWidth) : //access matrix area, top-left at position "out" for image with given width
  99. out_(out),
  100. outWidth_(outWidth) {}
  101. template <size_t I, size_t J>
  102. uint32_t& ref() const
  103. {
  104. static const size_t I_old = MatrixRotation<rotDeg, I, J, N>::I_old;
  105. static const size_t J_old = MatrixRotation<rotDeg, I, J, N>::J_old;
  106. return *(out_ + J_old + I_old * outWidth_);
  107. }
  108. private:
  109. uint32_t* out_;
  110. const int outWidth_;
  111. };
  //value multiplied by itself, returned in the argument's own type
  template <class T> inline
  T square(T value)
  {
      return value * value;
  }
  #if 0 //disabled alternative color distance, kept for reference
  inline
  double distRGB(uint32_t pix1, uint32_t pix2)
  {
      //per-channel differences (int subtraction, then widened to double)
      const double deltaRed   = static_cast<int>(getRed  (pix1)) - getRed  (pix2);
      const double deltaGreen = static_cast<int>(getGreen(pix1)) - getGreen(pix2);
      const double deltaBlue  = static_cast<int>(getBlue (pix1)) - getBlue (pix2);

      //Euclidean RGB distance
      const double sumOfSquares = square(deltaRed) + square(deltaGreen) + square(deltaBlue);
      return std::sqrt(sumOfSquares);
  }
  #endif
  125. inline
  126. double distYCbCr(uint32_t pix1, uint32_t pix2, double lumaWeight)
  127. {
  128. //https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion
  129. //YCbCr conversion is a matrix multiplication => take advantage of linearity by subtracting first!
  130. const int r_diff = static_cast<int>(getRed (pix1)) - getRed (pix2); //we may delay division by 255 to after matrix multiplication
  131. const int g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2); //
  132. const int b_diff = static_cast<int>(getBlue (pix1)) - getBlue (pix2); //substraction for int is noticeable faster than for double!
  133. //const double k_b = 0.0722; //ITU-R BT.709 conversion
  134. //const double k_r = 0.2126; //
  135. const double k_b = 0.0593; //ITU-R BT.2020 conversion
  136. const double k_r = 0.2627; //
  137. const double k_g = 1 - k_b - k_r;
  138. const double scale_b = 0.5 / (1 - k_b);
  139. const double scale_r = 0.5 / (1 - k_r);
  140. const double y = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
  141. const double c_b = scale_b * (b_diff - y);
  142. const double c_r = scale_r * (r_diff - y);
  143. //we skip division by 255 to have similar range like other distance functions
  144. return std::sqrt(square(lumaWeight * y) + square(c_b) + square(c_r));
  145. }
  146. inline
  147. double distYCbCrBuffered(uint32_t pix1, uint32_t pix2)
  148. {
  149. //30% perf boost compared to plain distYCbCr()!
  150. //consumes 64 MB memory; using double is only 2% faster, but takes 128 MB
  151. static const std::vector<float> diffToDist = []
  152. {
  153. std::vector<float> tmp;
  154. for (uint32_t i = 0; i < 256 * 256 * 256; ++i) //startup time: 114 ms on Intel Core i5 (four cores)
  155. {
  156. const int r_diff = static_cast<signed char>(getByte<2>(i)) * 2;
  157. const int g_diff = static_cast<signed char>(getByte<1>(i)) * 2;
  158. const int b_diff = static_cast<signed char>(getByte<0>(i)) * 2;
  159. const double k_b = 0.0593; //ITU-R BT.2020 conversion
  160. const double k_r = 0.2627; //
  161. const double k_g = 1 - k_b - k_r;
  162. const double scale_b = 0.5 / (1 - k_b);
  163. const double scale_r = 0.5 / (1 - k_r);
  164. const double y = k_r * r_diff + k_g * g_diff + k_b * b_diff; //[!], analog YCbCr!
  165. const double c_b = scale_b * (b_diff - y);
  166. const double c_r = scale_r * (r_diff - y);
  167. tmp.push_back(static_cast<float>(std::sqrt(square(y) + square(c_b) + square(c_r))));
  168. }
  169. return tmp;
  170. }();
  171. //if (pix1 == pix2) -> 8% perf degradation!
  172. // return 0;
  173. //if (pix1 < pix2)
  174. // std::swap(pix1, pix2); -> 30% perf degradation!!!
  175. const int r_diff = static_cast<int>(getRed (pix1)) - getRed (pix2);
  176. const int g_diff = static_cast<int>(getGreen(pix1)) - getGreen(pix2);
  177. const int b_diff = static_cast<int>(getBlue (pix1)) - getBlue (pix2);
  178. const size_t index = (static_cast<unsigned char>(r_diff / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte
  179. (static_cast<unsigned char>(g_diff / 2) << 8) |
  180. (static_cast<unsigned char>(b_diff / 2));
  181. #if 0 //attention: the following calculation creates an asymmetric color distance!!! (e.g. r_diff=46 will be unpacked as 45, but r_diff=-46 unpacks to -47
  182. const size_t index = (((r_diff + 0xFF) / 2) << 16) | //slightly reduce precision (division by 2) to squeeze value into single byte
  183. (((g_diff + 0xFF) / 2) << 8) |
  184. (( b_diff + 0xFF) / 2);
  185. #endif
  186. return diffToDist[index];
  187. }
  #if defined(_MSC_VER) && !defined(NDEBUG)
      //debugging aid, MSVC debug builds only: "breakIntoDebugger" triggers __debugbreak() in the processing functions
      const int debugPixelX = -1;
      const int debugPixelY = 58;
      thread_local bool breakIntoDebugger = false;
  #endif
  //strength of the blend indication for one pixel corner
  //attention: BlendType must fit into the value range of 2 bit!!!
  enum BlendType
  {
      BLEND_NONE     = 0,
      BLEND_NORMAL   = 1, //a normal indication to blend
      BLEND_DOMINANT = 2, //a strong indication to blend
  };
  200. struct BlendResult
  201. {
  202. BlendType
  203. /**/blend_f, blend_g,
  204. /**/blend_j, blend_k;
  205. };
  //3x3 neighborhood of pixels, stored row by row
  struct Kernel_3x3
  {
      uint32_t a, b, c; //top row
      uint32_t d, e, f; //middle row
      uint32_t g, h, i; //bottom row
  };
  //kernel for preprocessing step
  //member order is deliberate: the first nine members are the upper-left 3x3 area,
  //to support reinterpret_cast from Kernel_4x4 => Kernel_3x3
  struct Kernel_4x4
  {
      uint32_t a, b, c; //
      uint32_t e, f, g; // upper-left 3x3 area
      uint32_t i, j, k; //
      uint32_t m, n, o; //rest of the bottom row
      uint32_t d, h, l, p; //right column
  };
  222. /* input kernel area naming convention:
  223. -----------------
  224. | A | B | C | D |
  225. |---|---|---|---|
  226. | E | F | G | H | evaluate the four corners between F, G, J, K
  227. |---|---|---|---| input pixel is at position F
  228. | I | J | K | L |
  229. |---|---|---|---|
  230. | M | N | O | P |
  231. -----------------
  232. */
  233. template <class ColorDistance>
  234. FORCE_INLINE //detect blend direction
  235. BlendResult preProcessCorners(const Kernel_4x4& ker, const xbrz::ScalerCfg& cfg) //result: F, G, J, K corners of "GradientType"
  236. {
  237. #if defined _MSC_VER && !defined NDEBUG
  238. if (breakIntoDebugger)
  239. __debugbreak(); //__asm int 3;
  240. #endif
  241. BlendResult result = {};
  242. if ((ker.f == ker.g &&
  243. ker.j == ker.k) ||
  244. (ker.f == ker.j &&
  245. ker.g == ker.k))
  246. return result;
  247. auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight); };
  248. double jg = dist(ker.i, ker.f) + dist(ker.f, ker.c) + dist(ker.n, ker.k) + dist(ker.k, ker.h) + cfg.centerDirectionBias * dist(ker.j, ker.g);
  249. double fk = dist(ker.e, ker.j) + dist(ker.j, ker.o) + dist(ker.b, ker.g) + dist(ker.g, ker.l) + cfg.centerDirectionBias * dist(ker.f, ker.k);
  250. if (jg < fk) //test sample: 70% of values max(jg, fk) / min(jg, fk) are between 1.1 and 3.7 with median being 1.8
  251. {
  252. const bool dominantGradient = cfg.dominantDirectionThreshold * jg < fk;
  253. if (ker.f != ker.g && ker.f != ker.j)
  254. result.blend_f = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  255. if (ker.k != ker.j && ker.k != ker.g)
  256. result.blend_k = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  257. }
  258. else if (fk < jg)
  259. {
  260. const bool dominantGradient = cfg.dominantDirectionThreshold * fk < jg;
  261. if (ker.j != ker.f && ker.j != ker.k)
  262. result.blend_j = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  263. if (ker.g != ker.f && ker.g != ker.k)
  264. result.blend_g = dominantGradient ? BLEND_DOMINANT : BLEND_NORMAL;
  265. }
  266. return result;
  267. }
  268. #define DEF_GETTER(x) template <RotationDegree rotDeg> uint32_t inline get_##x(const Kernel_3x3& ker) { return ker.x; }
  269. //we cannot and NEED NOT write "ker.##x" since ## concatenates preprocessor tokens but "." is not a token
  270. DEF_GETTER(a) DEF_GETTER(b) DEF_GETTER(c)
  271. DEF_GETTER(d) DEF_GETTER(e) DEF_GETTER(f)
  272. DEF_GETTER(g) DEF_GETTER(h) DEF_GETTER(i)
  273. #undef DEF_GETTER
  274. #define DEF_GETTER(x, y) template <> [[maybe_unused]] inline uint32_t get_##x<ROT_90>(const Kernel_3x3& ker) { return ker.y; }
  275. DEF_GETTER(a, g) DEF_GETTER(b, d) DEF_GETTER(c, a)
  276. DEF_GETTER(d, h) DEF_GETTER(e, e) DEF_GETTER(f, b)
  277. DEF_GETTER(g, i) DEF_GETTER(h, f) DEF_GETTER(i, c)
  278. #undef DEF_GETTER
  279. #define DEF_GETTER(x, y) template <> [[maybe_unused]] inline uint32_t get_##x<ROT_180>(const Kernel_3x3& ker) { return ker.y; }
  280. DEF_GETTER(a, i) DEF_GETTER(b, h) DEF_GETTER(c, g)
  281. DEF_GETTER(d, f) DEF_GETTER(e, e) DEF_GETTER(f, d)
  282. DEF_GETTER(g, c) DEF_GETTER(h, b) DEF_GETTER(i, a)
  283. #undef DEF_GETTER
  284. #define DEF_GETTER(x, y) template <> [[maybe_unused]] inline uint32_t get_##x<ROT_270>(const Kernel_3x3& ker) { return ker.y; }
  285. DEF_GETTER(a, c) DEF_GETTER(b, f) DEF_GETTER(c, i)
  286. DEF_GETTER(d, b) DEF_GETTER(e, e) DEF_GETTER(f, h)
  287. DEF_GETTER(g, a) DEF_GETTER(h, d) DEF_GETTER(i, g)
  288. #undef DEF_GETTER
  289. //compress four blend types into a single byte
  290. //inline BlendType getTopL (unsigned char b) { return static_cast<BlendType>(0x3 & b); }
  291. inline BlendType getTopR (unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 2)); }
  292. inline BlendType getBottomR(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 4)); }
  293. inline BlendType getBottomL(unsigned char b) { return static_cast<BlendType>(0x3 & (b >> 6)); }
  294. inline void clearAddTopL(unsigned char& b, BlendType bt) { b = static_cast<unsigned char>(bt); }
  295. inline void addTopR (unsigned char& b, BlendType bt) { b |= (bt << 2); } //buffer is assumed to be initialized before preprocessing!
  296. inline void addBottomR (unsigned char& b, BlendType bt) { b |= (bt << 4); } //e.g. via clearAddTopL()
  297. inline void addBottomL (unsigned char& b, BlendType bt) { b |= (bt << 6); } //
  298. inline bool blendingNeeded(unsigned char b)
  299. {
  300. static_assert(BLEND_NONE == 0);
  301. return b != 0;
  302. }
  303. template <RotationDegree rotDeg> inline
  304. unsigned char rotateBlendInfo(unsigned char b) { return b; }
  305. template <> inline unsigned char rotateBlendInfo<ROT_90 >(unsigned char b) { return ((b << 2) | (b >> 6)) & 0xff; }
  306. template <> inline unsigned char rotateBlendInfo<ROT_180>(unsigned char b) { return ((b << 4) | (b >> 4)) & 0xff; }
  307. template <> inline unsigned char rotateBlendInfo<ROT_270>(unsigned char b) { return ((b << 6) | (b >> 2)) & 0xff; }
  308. /* input kernel area naming convention:
  309. -------------
  310. | A | B | C |
  311. |---|---|---|
  312. | D | E | F | input pixel is at position E
  313. |---|---|---|
  314. | G | H | I |
  315. -------------
  316. */
  317. template <class Scaler, class ColorDistance, RotationDegree rotDeg>
  318. FORCE_INLINE //perf: quite worth it!
  319. void blendPixel(const Kernel_3x3& ker,
  320. uint32_t* target, int trgWidth,
  321. unsigned char blendInfo, //result of preprocessing all four corners of pixel "e"
  322. const xbrz::ScalerCfg& cfg)
  323. {
  324. //#define a get_a<rotDeg>(ker)
  325. #define b get_b<rotDeg>(ker)
  326. #define c get_c<rotDeg>(ker)
  327. #define d get_d<rotDeg>(ker)
  328. #define e get_e<rotDeg>(ker)
  329. #define f get_f<rotDeg>(ker)
  330. #define g get_g<rotDeg>(ker)
  331. #define h get_h<rotDeg>(ker)
  332. #define i get_i<rotDeg>(ker)
  333. #if defined _MSC_VER && !defined NDEBUG
  334. if (breakIntoDebugger)
  335. __debugbreak(); //__asm int 3;
  336. #endif
  337. const unsigned char blend = rotateBlendInfo<rotDeg>(blendInfo);
  338. if (getBottomR(blend) >= BLEND_NORMAL)
  339. {
  340. auto eq = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight) < cfg.equalColorTolerance; };
  341. auto dist = [&](uint32_t pix1, uint32_t pix2) { return ColorDistance::dist(pix1, pix2, cfg.luminanceWeight); };
  342. const bool doLineBlend = [&]() -> bool
  343. {
  344. if (getBottomR(blend) >= BLEND_DOMINANT)
  345. return true;
  346. //make sure there is no second blending in an adjacent rotation for this pixel: handles insular pixels, mario eyes
  347. if (getTopR(blend) != BLEND_NONE && !eq(e, g)) //but support double-blending for 90° corners
  348. return false;
  349. if (getBottomL(blend) != BLEND_NONE && !eq(e, c))
  350. return false;
  351. //no full blending for L-shapes; blend corner only (handles "mario mushroom eyes")
  352. if (!eq(e, i) && eq(g, h) && eq(h, i) && eq(i, f) && eq(f, c))
  353. return false;
  354. return true;
  355. }();
  356. const uint32_t px = dist(e, f) <= dist(e, h) ? f : h; //choose most similar color
  357. OutputMatrix<Scaler::scale, rotDeg> out(target, trgWidth);
  358. if (doLineBlend)
  359. {
  360. const double fg = dist(f, g); //test sample: 70% of values max(fg, hc) / min(fg, hc) are between 1.1 and 3.7 with median being 1.9
  361. const double hc = dist(h, c); //
  362. const bool haveShallowLine = cfg.steepDirectionThreshold * fg <= hc && e != g && d != g;
  363. const bool haveSteepLine = cfg.steepDirectionThreshold * hc <= fg && e != c && b != c;
  364. if (haveShallowLine)
  365. {
  366. if (haveSteepLine)
  367. Scaler::blendLineSteepAndShallow(px, out);
  368. else
  369. Scaler::blendLineShallow(px, out);
  370. }
  371. else
  372. {
  373. if (haveSteepLine)
  374. Scaler::blendLineSteep(px, out);
  375. else
  376. Scaler::blendLineDiagonal(px, out);
  377. }
  378. }
  379. else
  380. Scaler::blendCorner(px, out);
  381. }
  382. //#undef a
  383. #undef b
  384. #undef c
  385. #undef d
  386. #undef e
  387. #undef f
  388. #undef g
  389. #undef h
  390. #undef i
  391. }
  392. class OobReaderTransparent
  393. {
  394. public:
  395. OobReaderTransparent(const uint32_t* src, int srcWidth, int srcHeight, int y) :
  396. s_m1(0 <= y - 1 && y - 1 < srcHeight ? src + srcWidth * (y - 1) : nullptr),
  397. s_0 (0 <= y && y < srcHeight ? src + srcWidth * y : nullptr),
  398. s_p1(0 <= y + 1 && y + 1 < srcHeight ? src + srcWidth * (y + 1) : nullptr),
  399. s_p2(0 <= y + 2 && y + 2 < srcHeight ? src + srcWidth * (y + 2) : nullptr),
  400. srcWidth_(srcWidth) {}
  401. void readDhlp(Kernel_4x4& ker, int x) const //(x, y) is at kernel position F
  402. {
  403. [[likely]] if (const int x_p2 = x + 2; 0 <= x_p2 && x_p2 < srcWidth_)
  404. {
  405. ker.d = s_m1 ? s_m1[x_p2] : 0;
  406. ker.h = s_0 ? s_0 [x_p2] : 0;
  407. ker.l = s_p1 ? s_p1[x_p2] : 0;
  408. ker.p = s_p2 ? s_p2[x_p2] : 0;
  409. }
  410. else
  411. {
  412. ker.d = 0;
  413. ker.h = 0;
  414. ker.l = 0;
  415. ker.p = 0;
  416. }
  417. }
  418. private:
  419. const uint32_t* const s_m1;
  420. const uint32_t* const s_0;
  421. const uint32_t* const s_p1;
  422. const uint32_t* const s_p2;
  423. const int srcWidth_;
  424. };
  425. class OobReaderDuplicate
  426. {
  427. public:
  428. OobReaderDuplicate(const uint32_t* src, int srcWidth, int srcHeight, int y) :
  429. s_m1(src + srcWidth * std::clamp(y - 1, 0, srcHeight - 1)),
  430. s_0 (src + srcWidth * std::clamp(y, 0, srcHeight - 1)),
  431. s_p1(src + srcWidth * std::clamp(y + 1, 0, srcHeight - 1)),
  432. s_p2(src + srcWidth * std::clamp(y + 2, 0, srcHeight - 1)),
  433. srcWidth_(srcWidth) {}
  434. void readDhlp(Kernel_4x4& ker, int x) const //(x, y) is at kernel position F
  435. {
  436. const int x_p2 = std::clamp(x + 2, 0, srcWidth_ - 1);
  437. ker.d = s_m1[x_p2];
  438. ker.h = s_0 [x_p2];
  439. ker.l = s_p1[x_p2];
  440. ker.p = s_p2[x_p2];
  441. }
  442. private:
  443. const uint32_t* const s_m1;
  444. const uint32_t* const s_0;
  445. const uint32_t* const s_p1;
  446. const uint32_t* const s_p2;
  447. const int srcWidth_;
  448. };
  449. template <class Scaler, class ColorDistance, class OobReader> //scaler policy: see "Scaler2x" reference implementation
  450. void scaleImage(const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, const xbrz::ScalerCfg& cfg, int yFirst, int yLast)
  451. {
  452. yFirst = std::max(yFirst, 0);
  453. yLast = std::min(yLast, srcHeight);
  454. if (yFirst >= yLast || srcWidth <= 0)
  455. return;
  456. const int trgWidth = srcWidth * Scaler::scale;
  457. //(ab)use space of "sizeof(uint32_t) * srcWidth * Scaler::scale" at the end of the image as temporary
  458. //buffer for "on the fly preprocessing" without risk of accidental overwriting before accessing
  459. unsigned char* const preProcBuf = reinterpret_cast<unsigned char*>(trg + yLast * Scaler::scale * trgWidth) - srcWidth;
  460. //initialize preprocessing buffer for first row of current stripe: detect upper left and right corner blending
  461. //this cannot be optimized for adjacent processing stripes; we must not allow for a memory race condition!
  462. {
  463. const OobReader oobReader(src, srcWidth, srcHeight, yFirst - 1);
  464. //initialize at position x = -1
  465. Kernel_4x4 ker4 = {};
  466. oobReader.readDhlp(ker4, -4); //hack: read a, e, i, m at x = -1
  467. ker4.a = ker4.d;
  468. ker4.e = ker4.h;
  469. ker4.i = ker4.l;
  470. ker4.m = ker4.p;
  471. oobReader.readDhlp(ker4, -3);
  472. ker4.b = ker4.d;
  473. ker4.f = ker4.h;
  474. ker4.j = ker4.l;
  475. ker4.n = ker4.p;
  476. oobReader.readDhlp(ker4, -2);
  477. ker4.c = ker4.d;
  478. ker4.g = ker4.h;
  479. ker4.k = ker4.l;
  480. ker4.o = ker4.p;
  481. oobReader.readDhlp(ker4, -1);
  482. {
  483. const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
  484. clearAddTopL(preProcBuf[0], res.blend_k); //set 1st known corner for (0, yFirst)
  485. }
  486. for (int x = 0; x < srcWidth; ++x)
  487. {
  488. ker4.a = ker4.b; //shift previous kernel to the left
  489. ker4.e = ker4.f; // -----------------
  490. ker4.i = ker4.j; // | A | B | C | D |
  491. ker4.m = ker4.n; // |---|---|---|---|
  492. /**/ // | E | F | G | H | (x, yFirst - 1) is at position F
  493. ker4.b = ker4.c; // |---|---|---|---|
  494. ker4.f = ker4.g; // | I | J | K | L |
  495. ker4.j = ker4.k; // |---|---|---|---|
  496. ker4.n = ker4.o; // | M | N | O | P |
  497. /**/ // -----------------
  498. ker4.c = ker4.d;
  499. ker4.g = ker4.h;
  500. ker4.k = ker4.l;
  501. ker4.o = ker4.p;
  502. oobReader.readDhlp(ker4, x);
  503. /* preprocessing blend result:
  504. ---------
  505. | F | G | evaluate corner between F, G, J, K
  506. |---+---| current input pixel is at position F
  507. | J | K |
  508. --------- */
  509. const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
  510. addTopR(preProcBuf[x], res.blend_j); //set 2nd known corner for (x, yFirst)
  511. if (x + 1 < srcWidth)
  512. clearAddTopL(preProcBuf[x + 1], res.blend_k); //set 1st known corner for (x + 1, yFirst)
  513. }
  514. }
  515. //------------------------------------------------------------------------------------
  516. for (int y = yFirst; y < yLast; ++y)
  517. {
  518. uint32_t* out = trg + Scaler::scale * y * trgWidth; //consider MT "striped" access
  519. const OobReader oobReader(src, srcWidth, srcHeight, y);
  520. //initialize at position x = -1
  521. Kernel_4x4 ker4 = {};
  522. oobReader.readDhlp(ker4, -4); //hack: read a, e, i, m at x = -1
  523. ker4.a = ker4.d;
  524. ker4.e = ker4.h;
  525. ker4.i = ker4.l;
  526. ker4.m = ker4.p;
  527. oobReader.readDhlp(ker4, -3);
  528. ker4.b = ker4.d;
  529. ker4.f = ker4.h;
  530. ker4.j = ker4.l;
  531. ker4.n = ker4.p;
  532. oobReader.readDhlp(ker4, -2);
  533. ker4.c = ker4.d;
  534. ker4.g = ker4.h;
  535. ker4.k = ker4.l;
  536. ker4.o = ker4.p;
  537. oobReader.readDhlp(ker4, -1);
  538. unsigned char blend_xy1 = 0; //corner blending for current (x, y + 1) position
  539. {
  540. const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
  541. clearAddTopL(blend_xy1, res.blend_k); //set 1st known corner for (0, y + 1) and buffer for use on next column
  542. addBottomL(preProcBuf[0], res.blend_g); //set 3rd known corner for (0, y)
  543. }
  544. for (int x = 0; x < srcWidth; ++x, out += Scaler::scale)
  545. {
  546. #if defined _MSC_VER && !defined NDEBUG
  547. breakIntoDebugger = debugPixelX == x && debugPixelY == y;
  548. #endif
  549. ker4.a = ker4.b; //shift previous kernel to the left
  550. ker4.e = ker4.f; // -----------------
  551. ker4.i = ker4.j; // | A | B | C | D |
  552. ker4.m = ker4.n; // |---|---|---|---|
  553. /**/ // | E | F | G | H | (x, y) is at position F
  554. ker4.b = ker4.c; // |---|---|---|---|
  555. ker4.f = ker4.g; // | I | J | K | L |
  556. ker4.j = ker4.k; // |---|---|---|---|
  557. ker4.n = ker4.o; // | M | N | O | P |
  558. /**/ // -----------------
  559. ker4.c = ker4.d;
  560. ker4.g = ker4.h;
  561. ker4.k = ker4.l;
  562. ker4.o = ker4.p;
  563. oobReader.readDhlp(ker4, x);
  564. //evaluate the four corners on bottom-right of current pixel
  565. unsigned char blend_xy = preProcBuf[x]; //for current (x, y) position
  566. {
  567. /* preprocessing blend result:
  568. ---------
  569. | F | G | evaluate corner between F, G, J, K
  570. |---+---| current input pixel is at position F
  571. | J | K |
  572. --------- */
  573. const BlendResult res = preProcessCorners<ColorDistance>(ker4, cfg);
  574. addBottomR(blend_xy, res.blend_f); //all four corners of (x, y) have been determined at this point due to processing sequence!
  575. addTopR(blend_xy1, res.blend_j); //set 2nd known corner for (x, y + 1)
  576. preProcBuf[x] = blend_xy1; //store on current buffer position for use on next row
  577. [[likely]] if (x + 1 < srcWidth)
  578. {
  579. //blend_xy1 -> blend_x1y1
  580. clearAddTopL(blend_xy1, res.blend_k); //set 1st known corner for (x + 1, y + 1) and buffer for use on next column
  581. addBottomL(preProcBuf[x + 1], res.blend_g); //set 3rd known corner for (x + 1, y)
  582. }
  583. }
  584. //fill block of size scale * scale with the given color
  585. fillBlock(out, trgWidth * sizeof(uint32_t), ker4.f, Scaler::scale, Scaler::scale);
  586. //place *after* preprocessing step, to not overwrite the results while processing the last pixel!
  587. //blend all four corners of current pixel
  588. if (blendingNeeded(blend_xy))
  589. {
  590. #ifndef _MSC_VER
  591. #pragma GCC diagnostic push
  592. #pragma GCC diagnostic ignored "-Wstrict-aliasing"
  593. #endif
  594. const auto& ker3 = reinterpret_cast<const Kernel_3x3&>(ker4); //"The Things We Do for Perf"
  595. blendPixel<Scaler, ColorDistance, ROT_0 >(ker3, out, trgWidth, blend_xy, cfg);
  596. blendPixel<Scaler, ColorDistance, ROT_90 >(ker3, out, trgWidth, blend_xy, cfg);
  597. blendPixel<Scaler, ColorDistance, ROT_180>(ker3, out, trgWidth, blend_xy, cfg);
  598. blendPixel<Scaler, ColorDistance, ROT_270>(ker3, out, trgWidth, blend_xy, cfg);
  599. #ifndef _MSC_VER
  600. #pragma GCC diagnostic pop
  601. #endif
  602. }
  603. }
  604. }
  605. }
  606. //------------------------------------------------------------------------------------
  //Scaler policy for factor 2: blends "col" into selected pixels of the 2x2 output block reached through "out"
  template <class ColorGradient>
  struct Scaler2x : public ColorGradient
  {
      static const int scale = 2;

      template <unsigned int M, unsigned int N> //bring template function into scope for GCC
      static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
      {
          ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
      }

      //bottom row (row index scale - 1 == 1): 1/4 on the left, 3/4 on the right
      template <class Matrix>
      static void blendLineShallow(uint32_t col, Matrix& out)
      {
          alphaGrad<1, 4>(out.template ref<1, 0>(), col);
          alphaGrad<3, 4>(out.template ref<1, 1>(), col);
      }

      //right column (column index scale - 1 == 1): 1/4 on top, 3/4 at the bottom
      template <class Matrix>
      static void blendLineSteep(uint32_t col, Matrix& out)
      {
          alphaGrad<1, 4>(out.template ref<0, 1>(), col);
          alphaGrad<3, 4>(out.template ref<1, 1>(), col);
      }

      template <class Matrix>
      static void blendLineSteepAndShallow(uint32_t col, Matrix& out)
      {
          alphaGrad<1, 4>(out.template ref<1, 0>(), col);
          alphaGrad<1, 4>(out.template ref<0, 1>(), col);
          alphaGrad<5, 6>(out.template ref<1, 1>(), col); //[!] fixes 7/8 used in xBR
      }

      template <class Matrix>
      static void blendLineDiagonal(uint32_t col, Matrix& out)
      {
          alphaGrad<1, 2>(out.template ref<1, 1>(), col);
      }

      template <class Matrix>
      static void blendCorner(uint32_t col, Matrix& out)
      {
          //model a round corner
          alphaGrad<21, 100>(out.template ref<1, 1>(), col); //exact: 1 - pi/4 = 0.2146018366
      }
  };
  //Blend patterns for a 3x3 output block (scale = 3).
  //Every function mixes "col" into selected cells of "out", addressed as out.ref<I, J>();
  //a plain assignment paints the cell completely with "col".
  //alphaGrad<M, N> is handed on to the ColorGradient policy; judging by the "exact" value noted at
  //blendCorner(), M / N is the share of "col" in the blended cell.
  template <class ColorGradient>
  struct Scaler3x : public ColorGradient
  {
      static const int scale = 3;

      //forwarder: makes the policy's template function reachable by its plain name (needed for GCC)
      template <unsigned int M, unsigned int N>
      static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
      {
          ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
      }

      //shallow line
      template <class OutMatrix>
      static void blendLineShallow(uint32_t col, OutMatrix& out)
      {
          //light cells
          alphaGrad<1, 4>(out.template ref<2, 0>(), col);
          alphaGrad<1, 4>(out.template ref<1, 2>(), col);

          //heavy cell
          alphaGrad<3, 4>(out.template ref<2, 1>(), col);

          //fully covered cell
          out.template ref<2, 2>() = col;
      }

      //steep line: same pattern as the shallow one with both indices swapped
      template <class OutMatrix>
      static void blendLineSteep(uint32_t col, OutMatrix& out)
      {
          //light cells
          alphaGrad<1, 4>(out.template ref<0, 2>(), col);
          alphaGrad<1, 4>(out.template ref<2, 1>(), col);

          //heavy cell
          alphaGrad<3, 4>(out.template ref<1, 2>(), col);

          //fully covered cell
          out.template ref<2, 2>() = col;
      }

      //steep and shallow line meeting in cell (2, 2)
      template <class OutMatrix>
      static void blendLineSteepAndShallow(uint32_t col, OutMatrix& out)
      {
          alphaGrad<1, 4>(out.template ref<2, 0>(), col);
          alphaGrad<1, 4>(out.template ref<0, 2>(), col);
          alphaGrad<3, 4>(out.template ref<2, 1>(), col);
          alphaGrad<3, 4>(out.template ref<1, 2>(), col);
          out.template ref<2, 2>() = col;
      }

      //diagonal line
      template <class OutMatrix>
      static void blendLineDiagonal(uint32_t col, OutMatrix& out)
      {
          //the two 1/8 cells conflict with other rotations for this odd scale
          alphaGrad<1, 8>(out.template ref<1, 2>(), col);
          alphaGrad<1, 8>(out.template ref<2, 1>(), col);
          alphaGrad<7, 8>(out.template ref<2, 2>(), col);
      }

      //rounded corner
      template <class OutMatrix>
      static void blendCorner(uint32_t col, OutMatrix& out)
      {
          alphaGrad<45, 100>(out.template ref<2, 2>(), col); //exact: 0.4545939598

          //deliberately left out: negligible + avoids conflicts with other rotations for this odd scale
          //alphaGrad<7, 256>(out.template ref<2, 1>(), col); //0.02826017254
          //alphaGrad<7, 256>(out.template ref<1, 2>(), col); //0.02826017254
      }
  };
  //Blend patterns for a 4x4 output block (scale = 4).
  //Every function mixes "col" into selected cells of "out", addressed as out.ref<I, J>();
  //a plain assignment paints the cell completely with "col".
  //alphaGrad<M, N> is handed on to the ColorGradient policy; judging by the "exact" values noted at
  //blendCorner(), M / N is the share of "col" in the blended cell.
  template <class ColorGradient>
  struct Scaler4x : public ColorGradient
  {
      static const int scale = 4;

      //forwarder: makes the policy's template function reachable by its plain name (needed for GCC)
      template <unsigned int M, unsigned int N>
      static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
      {
          ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
      }

      //shallow line
      template <class OutMatrix>
      static void blendLineShallow(uint32_t col, OutMatrix& out)
      {
          //light cells
          alphaGrad<1, 4>(out.template ref<3, 0>(), col);
          alphaGrad<1, 4>(out.template ref<2, 2>(), col);

          //heavy cells
          alphaGrad<3, 4>(out.template ref<3, 1>(), col);
          alphaGrad<3, 4>(out.template ref<2, 3>(), col);

          //fully covered cells
          out.template ref<3, 2>() = col;
          out.template ref<3, 3>() = col;
      }

      //steep line: same pattern as the shallow one with both indices swapped
      template <class OutMatrix>
      static void blendLineSteep(uint32_t col, OutMatrix& out)
      {
          //light cells
          alphaGrad<1, 4>(out.template ref<0, 3>(), col);
          alphaGrad<1, 4>(out.template ref<2, 2>(), col);

          //heavy cells
          alphaGrad<3, 4>(out.template ref<1, 3>(), col);
          alphaGrad<3, 4>(out.template ref<3, 2>(), col);

          //fully covered cells
          out.template ref<2, 3>() = col;
          out.template ref<3, 3>() = col;
      }

      //steep and shallow line meeting near cell (3, 3)
      template <class OutMatrix>
      static void blendLineSteepAndShallow(uint32_t col, OutMatrix& out)
      {
          alphaGrad<3, 4>(out.template ref<3, 1>(), col);
          alphaGrad<3, 4>(out.template ref<1, 3>(), col);
          alphaGrad<1, 4>(out.template ref<3, 0>(), col);
          alphaGrad<1, 4>(out.template ref<0, 3>(), col);

          alphaGrad<1, 3>(out.template ref<2, 2>(), col); //[!] fixes 1/4 used in xBR

          //fully covered cells
          out.template ref<3, 3>() = col;
          out.template ref<3, 2>() = col;
          out.template ref<2, 3>() = col;
      }

      //diagonal line
      template <class OutMatrix>
      static void blendLineDiagonal(uint32_t col, OutMatrix& out)
      {
          //half covered cells: (scale - 1, scale / 2) and (scale - 2, scale / 2 + 1)
          alphaGrad<1, 2>(out.template ref<3, 2>(), col);
          alphaGrad<1, 2>(out.template ref<2, 3>(), col);

          out.template ref<3, 3>() = col;
      }

      //rounded corner
      template <class OutMatrix>
      static void blendCorner(uint32_t col, OutMatrix& out)
      {
          alphaGrad<68, 100>(out.template ref<3, 3>(), col); //exact: 0.6848532563
          alphaGrad< 9, 100>(out.template ref<3, 2>(), col); //0.08677704501
          alphaGrad< 9, 100>(out.template ref<2, 3>(), col); //0.08677704501
      }
  };
  //Blend patterns for a 5x5 output block (scale = 5).
  //Every function mixes "col" into selected cells of "out", addressed as out.ref<I, J>();
  //a plain assignment paints the cell completely with "col".
  //alphaGrad<M, N> is handed on to the ColorGradient policy; judging by the "exact" values noted at
  //blendCorner(), M / N is the share of "col" in the blended cell.
  template <class ColorGradient>
  struct Scaler5x : public ColorGradient
  {
      static const int scale = 5;

      //forwarder: makes the policy's template function reachable by its plain name (needed for GCC)
      template <unsigned int M, unsigned int N>
      static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
      {
          ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
      }

      //shallow line
      template <class OutMatrix>
      static void blendLineShallow(uint32_t col, OutMatrix& out)
      {
          //light cells
          alphaGrad<1, 4>(out.template ref<4, 0>(), col);
          alphaGrad<1, 4>(out.template ref<3, 2>(), col);
          alphaGrad<1, 4>(out.template ref<2, 4>(), col);

          //heavy cells
          alphaGrad<3, 4>(out.template ref<4, 1>(), col);
          alphaGrad<3, 4>(out.template ref<3, 3>(), col);

          //fully covered cells
          out.template ref<4, 2>() = col;
          out.template ref<4, 3>() = col;
          out.template ref<4, 4>() = col;
          out.template ref<3, 4>() = col;
      }

      //steep line: same pattern as the shallow one with both indices swapped
      template <class OutMatrix>
      static void blendLineSteep(uint32_t col, OutMatrix& out)
      {
          //light cells
          alphaGrad<1, 4>(out.template ref<0, 4>(), col);
          alphaGrad<1, 4>(out.template ref<2, 3>(), col);
          alphaGrad<1, 4>(out.template ref<4, 2>(), col);

          //heavy cells
          alphaGrad<3, 4>(out.template ref<1, 4>(), col);
          alphaGrad<3, 4>(out.template ref<3, 3>(), col);

          //fully covered cells
          out.template ref<2, 4>() = col;
          out.template ref<3, 4>() = col;
          out.template ref<4, 4>() = col;
          out.template ref<4, 3>() = col;
      }

      //steep and shallow line meeting near cell (4, 4)
      template <class OutMatrix>
      static void blendLineSteepAndShallow(uint32_t col, OutMatrix& out)
      {
          //steep part
          alphaGrad<1, 4>(out.template ref<0, 4>(), col);
          alphaGrad<1, 4>(out.template ref<2, 3>(), col);
          alphaGrad<3, 4>(out.template ref<1, 4>(), col);

          //shallow part
          alphaGrad<1, 4>(out.template ref<4, 0>(), col);
          alphaGrad<1, 4>(out.template ref<3, 2>(), col);
          alphaGrad<3, 4>(out.template ref<4, 1>(), col);

          //cell shared by both parts
          alphaGrad<2, 3>(out.template ref<3, 3>(), col);

          //fully covered cells
          out.template ref<2, 4>() = col;
          out.template ref<3, 4>() = col;
          out.template ref<4, 4>() = col;

          out.template ref<4, 2>() = col;
          out.template ref<4, 3>() = col;
      }

      //diagonal line
      template <class OutMatrix>
      static void blendLineDiagonal(uint32_t col, OutMatrix& out)
      {
          //cells (scale - 1 - k, scale / 2 + k) for k = 0, 1, 2: conflict with other rotations for this odd scale
          alphaGrad<1, 8>(out.template ref<4, 2>(), col);
          alphaGrad<1, 8>(out.template ref<3, 3>(), col);
          alphaGrad<1, 8>(out.template ref<2, 4>(), col);

          alphaGrad<7, 8>(out.template ref<4, 3>(), col);
          alphaGrad<7, 8>(out.template ref<3, 4>(), col);

          out.template ref<4, 4>() = col;
      }

      //rounded corner
      template <class OutMatrix>
      static void blendCorner(uint32_t col, OutMatrix& out)
      {
          alphaGrad<86, 100>(out.template ref<4, 4>(), col); //exact: 0.8631434088
          alphaGrad<23, 100>(out.template ref<4, 3>(), col); //0.2306749731
          alphaGrad<23, 100>(out.template ref<3, 4>(), col); //0.2306749731

          //deliberately left out: negligible + avoids conflicts with other rotations for this odd scale
          //alphaGrad<1, 64>(out.template ref<4, 2>(), col); //0.01676812367
          //alphaGrad<1, 64>(out.template ref<2, 4>(), col); //0.01676812367
      }
  };
  //Blend patterns for a 6x6 output block (scale = 6).
  //Every function mixes "col" into selected cells of "out", addressed as out.ref<I, J>();
  //a plain assignment paints the cell completely with "col".
  //alphaGrad<M, N> is handed on to the ColorGradient policy; judging by the "exact" values noted at
  //blendCorner(), M / N is the share of "col" in the blended cell.
  template <class ColorGradient>
  struct Scaler6x : public ColorGradient
  {
      static const int scale = 6;

      //forwarder: makes the policy's template function reachable by its plain name (needed for GCC)
      template <unsigned int M, unsigned int N>
      static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
      {
          ColorGradient::template alphaGrad<M, N>(pixBack, pixFront);
      }

      //shallow line
      template <class OutMatrix>
      static void blendLineShallow(uint32_t col, OutMatrix& out)
      {
          //light cells
          alphaGrad<1, 4>(out.template ref<5, 0>(), col);
          alphaGrad<1, 4>(out.template ref<4, 2>(), col);
          alphaGrad<1, 4>(out.template ref<3, 4>(), col);

          //heavy cells
          alphaGrad<3, 4>(out.template ref<5, 1>(), col);
          alphaGrad<3, 4>(out.template ref<4, 3>(), col);
          alphaGrad<3, 4>(out.template ref<3, 5>(), col);

          //fully covered cells
          out.template ref<5, 2>() = col;
          out.template ref<5, 3>() = col;
          out.template ref<5, 4>() = col;
          out.template ref<5, 5>() = col;

          out.template ref<4, 4>() = col;
          out.template ref<4, 5>() = col;
      }

      //steep line: same pattern as the shallow one with both indices swapped
      template <class OutMatrix>
      static void blendLineSteep(uint32_t col, OutMatrix& out)
      {
          //light cells
          alphaGrad<1, 4>(out.template ref<0, 5>(), col);
          alphaGrad<1, 4>(out.template ref<2, 4>(), col);
          alphaGrad<1, 4>(out.template ref<4, 3>(), col);

          //heavy cells
          alphaGrad<3, 4>(out.template ref<1, 5>(), col);
          alphaGrad<3, 4>(out.template ref<3, 4>(), col);
          alphaGrad<3, 4>(out.template ref<5, 3>(), col);

          //fully covered cells
          out.template ref<2, 5>() = col;
          out.template ref<3, 5>() = col;
          out.template ref<4, 5>() = col;
          out.template ref<5, 5>() = col;

          out.template ref<4, 4>() = col;
          out.template ref<5, 4>() = col;
      }

      //steep and shallow line meeting near cell (5, 5)
      template <class OutMatrix>
      static void blendLineSteepAndShallow(uint32_t col, OutMatrix& out)
      {
          //steep part
          alphaGrad<1, 4>(out.template ref<0, 5>(), col);
          alphaGrad<1, 4>(out.template ref<2, 4>(), col);
          alphaGrad<3, 4>(out.template ref<1, 5>(), col);
          alphaGrad<3, 4>(out.template ref<3, 4>(), col);

          //shallow part
          alphaGrad<1, 4>(out.template ref<5, 0>(), col);
          alphaGrad<1, 4>(out.template ref<4, 2>(), col);
          alphaGrad<3, 4>(out.template ref<5, 1>(), col);
          alphaGrad<3, 4>(out.template ref<4, 3>(), col);

          //fully covered cells
          out.template ref<2, 5>() = col;
          out.template ref<3, 5>() = col;
          out.template ref<4, 5>() = col;
          out.template ref<5, 5>() = col;

          out.template ref<4, 4>() = col;
          out.template ref<5, 4>() = col;

          out.template ref<5, 2>() = col;
          out.template ref<5, 3>() = col;
      }

      //diagonal line
      template <class OutMatrix>
      static void blendLineDiagonal(uint32_t col, OutMatrix& out)
      {
          //half covered cells (scale - 1 - k, scale / 2 + k) for k = 0, 1, 2
          alphaGrad<1, 2>(out.template ref<5, 3>(), col);
          alphaGrad<1, 2>(out.template ref<4, 4>(), col);
          alphaGrad<1, 2>(out.template ref<3, 5>(), col);

          //fully covered cells
          out.template ref<4, 5>() = col;
          out.template ref<5, 5>() = col;
          out.template ref<5, 4>() = col;
      }

      //rounded corner
      template <class OutMatrix>
      static void blendCorner(uint32_t col, OutMatrix& out)
      {
          alphaGrad<97, 100>(out.template ref<5, 5>(), col); //exact: 0.9711013910
          alphaGrad<42, 100>(out.template ref<4, 5>(), col); //0.4236372243
          alphaGrad<42, 100>(out.template ref<5, 4>(), col); //0.4236372243
          alphaGrad< 6, 100>(out.template ref<5, 3>(), col); //0.05652034508
          alphaGrad< 6, 100>(out.template ref<3, 5>(), col); //0.05652034508
      }
  };
  893. //------------------------------------------------------------------------------------
  894. struct ColorDistanceRGB
  895. {
  896. static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
  897. {
  898. return distYCbCrBuffered(pix1, pix2);
  899. //if (pix1 == pix2) //about 4% perf boost
  900. // return 0;
  901. //return distYCbCr(pix1, pix2, luminanceWeight);
  902. }
  903. };
  904. struct ColorDistanceARGB
  905. {
  906. static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
  907. {
  908. const double a1 = getAlpha(pix1) / 255.0 ;
  909. const double a2 = getAlpha(pix2) / 255.0 ;
  910. /*
  911. Requirements for a color distance handling alpha channel: with a1, a2 in [0, 1]
  912. 1. if a1 = a2, distance should be: a1 * distYCbCr()
  913. 2. if a1 = 0, distance should be: a2 * distYCbCr(black, white) = a2 * 255
  914. 3. if a1 = 1, ??? maybe: 255 * (1 - a2) + a2 * distYCbCr()
  915. */
  916. //return std::min(a1, a2) * distYCbCrBuffered(pix1, pix2) + 255 * abs(a1 - a2);
  917. //=> following code is 15% faster:
  918. const double d = distYCbCrBuffered(pix1, pix2);
  919. if (a1 < a2)
  920. return a1 * d + 255 * (a2 - a1);
  921. else
  922. return a2 * d + 255 * (a1 - a2);
  923. //alternative? return std::sqrt(a1 * a2 * square(distYCbCrBuffered(pix1, pix2)) + square(255 * (a1 - a2)));
  924. }
  925. };
  926. struct ColorDistanceUnbufferedARGB
  927. {
  928. static double dist(uint32_t pix1, uint32_t pix2, double luminanceWeight)
  929. {
  930. const double a1 = getAlpha(pix1) / 255.0 ;
  931. const double a2 = getAlpha(pix2) / 255.0 ;
  932. const double d = distYCbCr(pix1, pix2, luminanceWeight);
  933. if (a1 < a2)
  934. return a1 * d + 255 * (a2 - a1);
  935. else
  936. return a2 * d + 255 * (a1 - a2);
  937. }
  938. };
  939. struct ColorGradientRGB
  940. {
  941. template <unsigned int M, unsigned int N>
  942. static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
  943. {
  944. pixBack = gradientRGB<M, N>(pixFront, pixBack);
  945. }
  946. };
  947. struct ColorGradientARGB
  948. {
  949. template <unsigned int M, unsigned int N>
  950. static void alphaGrad(uint32_t& pixBack, uint32_t pixFront)
  951. {
  952. pixBack = gradientARGB<M, N>(pixFront, pixBack);
  953. }
  954. };
  955. }
  956. void xbrz::scale(size_t factor, const uint32_t* src, uint32_t* trg, int srcWidth, int srcHeight, ColorFormat colFmt, const xbrz::ScalerCfg& cfg, int yFirst, int yLast)
  957. {
  958. if (factor == 1)
  959. {
  960. std::copy(src + yFirst * srcWidth, src + yLast * srcWidth, trg);
  961. return;
  962. }
  963. static_assert(SCALE_FACTOR_MAX == 6);
  964. switch (colFmt)
  965. {
  966. case ColorFormat::RGB:
  967. switch (factor)
  968. {
  969. case 2:
  970. return scaleImage<Scaler2x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  971. case 3:
  972. return scaleImage<Scaler3x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  973. case 4:
  974. return scaleImage<Scaler4x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  975. case 5:
  976. return scaleImage<Scaler5x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  977. case 6:
  978. return scaleImage<Scaler6x<ColorGradientRGB>, ColorDistanceRGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  979. }
  980. break;
  981. case ColorFormat::ARGB_CLAMPED:
  982. switch (factor)
  983. {
  984. case 2:
  985. return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceARGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  986. case 3:
  987. return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceARGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  988. case 4:
  989. return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceARGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  990. case 5:
  991. return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceARGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  992. case 6:
  993. return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceARGB, OobReaderDuplicate>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  994. }
  995. break;
  996. case ColorFormat::ARGB:
  997. switch (factor)
  998. {
  999. case 2:
  1000. return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  1001. case 3:
  1002. return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  1003. case 4:
  1004. return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  1005. case 5:
  1006. return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  1007. case 6:
  1008. return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  1009. }
  1010. break;
  1011. case ColorFormat::ARGB_UNBUFFERED:
  1012. switch (factor)
  1013. {
  1014. case 2:
  1015. return scaleImage<Scaler2x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  1016. case 3:
  1017. return scaleImage<Scaler3x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  1018. case 4:
  1019. return scaleImage<Scaler4x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  1020. case 5:
  1021. return scaleImage<Scaler5x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  1022. case 6:
  1023. return scaleImage<Scaler6x<ColorGradientARGB>, ColorDistanceUnbufferedARGB, OobReaderTransparent>(src, trg, srcWidth, srcHeight, cfg, yFirst, yLast);
  1024. }
  1025. break;
  1026. }
  1027. assert(false);
  1028. }
  1029. bool xbrz::equalColorTest(uint32_t col1, uint32_t col2, ColorFormat colFmt, double luminanceWeight, double equalColorTolerance)
  1030. {
  1031. switch (colFmt)
  1032. {
  1033. case ColorFormat::RGB:
  1034. return ColorDistanceRGB::dist(col1, col2, luminanceWeight) < equalColorTolerance;
  1035. case ColorFormat::ARGB:
  1036. case ColorFormat::ARGB_CLAMPED:
  1037. return ColorDistanceARGB::dist(col1, col2, luminanceWeight) < equalColorTolerance;
  1038. case ColorFormat::ARGB_UNBUFFERED:
  1039. return ColorDistanceUnbufferedARGB::dist(col1, col2, luminanceWeight) < equalColorTolerance;
  1040. }
  1041. assert(false);
  1042. return false;
  1043. }
  1044. void xbrz::bilinearScale(const uint32_t* src, int srcWidth, int srcHeight,
  1045. /**/ uint32_t* trg, int trgWidth, int trgHeight)
  1046. {
  1047. bilinearScale(src, srcWidth, srcHeight, srcWidth * sizeof(uint32_t),
  1048. trg, trgWidth, trgHeight, trgWidth * sizeof(uint32_t),
  1049. 0, trgHeight, [](uint32_t pix) { return pix; });
  1050. }
  1051. void xbrz::nearestNeighborScale(const uint32_t* src, int srcWidth, int srcHeight,
  1052. /**/ uint32_t* trg, int trgWidth, int trgHeight)
  1053. {
  1054. nearestNeighborScale(src, srcWidth, srcHeight, srcWidth * sizeof(uint32_t),
  1055. trg, trgWidth, trgHeight, trgWidth * sizeof(uint32_t),
  1056. 0, trgHeight, [](uint32_t pix) { return pix; });
  1057. }
  1058. #if 0
  1059. //#include <ppl.h>
  1060. void bilinearScaleCpu(const uint32_t* src, int srcWidth, int srcHeight,
  1061. /**/ uint32_t* trg, int trgWidth, int trgHeight)
  1062. {
  1063. const int TASK_GRANULARITY = 16;
  1064. concurrency::task_group tg;
  1065. for (int i = 0; i < trgHeight; i += TASK_GRANULARITY)
  1066. tg.run([=]
  1067. {
  1068. const int iLast = std::min(i + TASK_GRANULARITY, trgHeight);
  1069. xbrz::bilinearScale(src, srcWidth, srcHeight, srcWidth * sizeof(uint32_t),
  1070. trg, trgWidth, trgHeight, trgWidth * sizeof(uint32_t),
  1071. i, iLast, [](uint32_t pix) { return pix; });
  1072. });
  1073. tg.wait();
  1074. }
  1075. //Perf: AMP vs CPU: merely ~10% shorter runtime (scaling 1280x800 -> 1920x1080)
  1076. //#include <amp.h>
  1077. void bilinearScaleAmp(const uint32_t* src, int srcWidth, int srcHeight, //throw concurrency::runtime_exception
  1078. /**/ uint32_t* trg, int trgWidth, int trgHeight)
  1079. {
  1080. //C++ AMP reference: https://msdn.microsoft.com/en-us/library/hh289390.aspx
  1081. //introduction to C++ AMP: https://msdn.microsoft.com/en-us/magazine/hh882446.aspx
  1082. using namespace concurrency;
  1083. //TODO: pitch
  1084. if (srcHeight <= 0 || srcWidth <= 0) return;
  1085. const float scaleX = static_cast<float>(trgWidth ) / srcWidth;
  1086. const float scaleY = static_cast<float>(trgHeight) / srcHeight;
  1087. array_view<const uint32_t, 2> srcView(srcHeight, srcWidth, src);
  1088. array_view< uint32_t, 2> trgView(trgHeight, trgWidth, trg);
  1089. trgView.discard_data();
  1090. parallel_for_each(trgView.extent, [=](index<2> idx) restrict(amp) //throw ?
  1091. {
  1092. const int y = idx[0];
  1093. const int x = idx[1];
  1094. //Perf notes:
  1095. // -> float-based calculation is (almost) 2x as fas as double!
  1096. // -> no noticeable improvement via tiling: https://msdn.microsoft.com/en-us/magazine/hh882447.aspx
  1097. // -> no noticeable improvement with restrict(amp,cpu)
  1098. // -> iterating over y-axis only is significantly slower!
  1099. // -> pre-calculating x,y-dependent variables in a buffer + array_view<> is ~ 20 % slower!
  1100. const int y1 = srcHeight * y / trgHeight;
  1101. int y2 = y1 + 1;
  1102. if (y2 == srcHeight) --y2;
  1103. const float yy1 = y / scaleY - y1;
  1104. const float y2y = 1 - yy1;
  1105. //-------------------------------------
  1106. const int x1 = srcWidth * x / trgWidth;
  1107. int x2 = x1 + 1;
  1108. if (x2 == srcWidth) --x2;
  1109. const float xx1 = x / scaleX - x1;
  1110. const float x2x = 1 - xx1;
  1111. //-------------------------------------
  1112. const float x2xy2y = x2x * y2y;
  1113. const float xx1y2y = xx1 * y2y;
  1114. const float x2xyy1 = x2x * yy1;
  1115. const float xx1yy1 = xx1 * yy1;
  1116. auto interpolate = [=](int offset)
  1117. {
  1118. /*
  1119. https://en.wikipedia.org/wiki/Bilinear_interpolation
  1120. (c11(x2 - x) + c21(x - x1)) * (y2 - y ) +
  1121. (c12(x2 - x) + c22(x - x1)) * (y - y1)
  1122. */
  1123. const auto c11 = (srcView(y1, x1) >> (8 * offset)) & 0xff;
  1124. const auto c21 = (srcView(y1, x2) >> (8 * offset)) & 0xff;
  1125. const auto c12 = (srcView(y2, x1) >> (8 * offset)) & 0xff;
  1126. const auto c22 = (srcView(y2, x2) >> (8 * offset)) & 0xff;
  1127. return c11 * x2xy2y + c21 * xx1y2y +
  1128. c12 * x2xyy1 + c22 * xx1yy1;
  1129. };
  1130. const float bi = interpolate(0);
  1131. const float gi = interpolate(1);
  1132. const float ri = interpolate(2);
  1133. const float ai = interpolate(3);
  1134. const auto b = static_cast<uint32_t>(bi + 0.5f);
  1135. const auto g = static_cast<uint32_t>(gi + 0.5f);
  1136. const auto r = static_cast<uint32_t>(ri + 0.5f);
  1137. const auto a = static_cast<uint32_t>(ai + 0.5f);
  1138. trgView(y, x) = (a << 24) | (r << 16) | (g << 8) | b;
  1139. });
  1140. trgView.synchronize(); //throw ?
  1141. }
  1142. #endif