// Slugify.h
  #pragma once
  #include <algorithm>
  #include <cwctype>
  #include <regex>
  #include <string>
  #include <unordered_map>
  /// Replaces every non-overlapping occurrence of `from` inside `input` with
  /// `to`, modifying `input` in place.
  ///
  /// The scan runs left to right and resumes right after each inserted `to`,
  /// so text introduced by a replacement is never matched again.
  ///
  /// @param input  String to modify.
  /// @param from   Substring to search for. An empty `from` leaves `input` untouched.
  /// @param to     Replacement text; may be empty, which deletes the matches.
  ///
  /// Declared `inline` because this definition lives in a header and may be
  /// included from several translation units.
  inline void replace_all(std::wstring& input, const std::wstring& from, const std::wstring& to)
  {
      // An empty pattern is found at every position; without this guard the
      // loop below would never terminate.
      if (from.empty())
      {
          return;
      }

      std::wstring::size_type pos = 0;
      while ((pos = input.find(from, pos)) != std::wstring::npos)
      {
          input.replace(pos, from.size(), to);
          // Continue after the inserted text so a `to` that contains `from`
          // is not matched again.
          pos += to.size();
      }
  }
  /// Returns a copy of `s` with leading and trailing whitespace removed.
  ///
  /// Whitespace is classified with `std::iswspace`, the wide-character
  /// classifier. The narrow `isspace` must not be used here: it has undefined
  /// behaviour for values that do not fit in `unsigned char`, which most
  /// non-ASCII `wchar_t` values do not.
  ///
  /// @param s  String to trim; it is not modified.
  /// @return   The trimmed copy. Empty if `s` is empty or whitespace only.
  ///
  /// Declared `inline` because this definition lives in a header and may be
  /// included from several translation units.
  inline std::wstring trim(const std::wstring &s)
  {
      const auto isSpace = [](wchar_t ch) {
          return std::iswspace(static_cast<std::wint_t>(ch)) != 0;
      };

      // Advance past the leading whitespace.
      std::wstring::const_iterator first = s.begin();
      while (first != s.end() && isSpace(*first))
      {
          ++first;
      }

      // Walk back over the trailing whitespace, never crossing `first`, so a
      // whitespace-only string yields an empty range.
      std::wstring::const_iterator last = s.end();
      while (last != first && isSpace(*(last - 1)))
      {
          --last;
      }

      return std::wstring(first, last);
  }
  // SLUGIFY
  /// Converts arbitrary text into a lowercase, hyphen-separated ASCII slug.
  ///
  /// Processing steps:
  ///   1. Every character found in the transliteration table below is replaced
  ///      by its ASCII spelling (accented Latin, Greek, Cyrillic, currency
  ///      signs and a few symbols).
  ///   2. ASCII letters are lower-cased.
  ///   3. Everything except `a-z`, `0-9`, `-` and whitespace is dropped.
  ///   4. Leading and trailing whitespace is discarded.
  ///   5. Each remaining run of hyphens and/or whitespace becomes a single `-`.
  ///
  /// Hyphens that were literally present at the start or end of the input are
  /// kept; only surrounding whitespace is trimmed.
  ///
  /// @param input  Text to convert.
  /// @return       The slug; empty when nothing in `input` survives filtering.
  ///
  /// Notes on the implementation:
  ///   - The table uses `L` literals rather than `_T(...)`, so it does not
  ///     depend on `<tchar.h>` or on `_UNICODE` being defined. The source file
  ///     must be compiled with an encoding the compiler decodes correctly
  ///     (e.g. UTF-8).
  ///   - The table is a function-local static: it is built once instead of on
  ///     every call, and each input character costs a single hash lookup.
  ///   - Case folding is ASCII-only and therefore locale independent; every
  ///     non-ASCII character is either transliterated or dropped anyway.
  ///   - Declared `inline` because this definition lives in a header.
  inline std::wstring slugify(std::wstring input)
  {
      // Some characters appear in more than one language section; the
      // repeated entries carry the same replacement, so they are harmless.
      static const std::unordered_map<wchar_t, const wchar_t*> charMap{
          // latin
          {L'À', L"A"}, {L'Á', L"A"}, {L'Â', L"A"}, {L'Ã', L"A"}, {L'Ä', L"A"}, {L'Å', L"A"}, {L'Æ', L"AE"},
          {L'Ç', L"C"}, {L'È', L"E"}, {L'É', L"E"}, {L'Ê', L"E"}, {L'Ë', L"E"}, {L'Ì', L"I"}, {L'Í', L"I"},
          {L'Î', L"I"}, {L'Ï', L"I"}, {L'Ð', L"D"}, {L'Ñ', L"N"}, {L'Ò', L"O"}, {L'Ó', L"O"}, {L'Ô', L"O"},
          {L'Õ', L"O"}, {L'Ö', L"O"}, {L'Ő', L"O"}, {L'Ø', L"O"}, {L'Ù', L"U"}, {L'Ú', L"U"}, {L'Û', L"U"},
          {L'Ü', L"U"}, {L'Ű', L"U"}, {L'Ý', L"Y"}, {L'Þ', L"TH"}, {L'ß', L"ss"}, {L'à', L"a"}, {L'á', L"a"},
          {L'â', L"a"}, {L'ã', L"a"}, {L'ä', L"a"}, {L'å', L"a"}, {L'æ', L"ae"}, {L'ç', L"c"}, {L'è', L"e"},
          {L'é', L"e"}, {L'ê', L"e"}, {L'ë', L"e"}, {L'ì', L"i"}, {L'í', L"i"}, {L'î', L"i"}, {L'ï', L"i"},
          {L'ð', L"d"}, {L'ñ', L"n"}, {L'ò', L"o"}, {L'ó', L"o"}, {L'ô', L"o"}, {L'õ', L"o"}, {L'ö', L"o"},
          {L'ő', L"o"}, {L'ø', L"o"}, {L'ù', L"u"}, {L'ú', L"u"}, {L'û', L"u"}, {L'ü', L"u"}, {L'ű', L"u"},
          {L'ý', L"y"}, {L'þ', L"th"}, {L'ÿ', L"y"}, {L'ẞ', L"SS"},
          // greek
          {L'α', L"a"}, {L'β', L"b"}, {L'γ', L"g"}, {L'δ', L"d"}, {L'ε', L"e"}, {L'ζ', L"z"}, {L'η', L"h"}, {L'θ', L"8"},
          {L'ι', L"i"}, {L'κ', L"k"}, {L'λ', L"l"}, {L'μ', L"m"}, {L'ν', L"n"}, {L'ξ', L"3"}, {L'ο', L"o"}, {L'π', L"p"},
          {L'ρ', L"r"}, {L'σ', L"s"}, {L'τ', L"t"}, {L'υ', L"y"}, {L'φ', L"f"}, {L'χ', L"x"}, {L'ψ', L"ps"}, {L'ω', L"w"},
          {L'ά', L"a"}, {L'έ', L"e"}, {L'ί', L"i"}, {L'ό', L"o"}, {L'ύ', L"y"}, {L'ή', L"h"}, {L'ώ', L"w"}, {L'ς', L"s"},
          {L'ϊ', L"i"}, {L'ΰ', L"y"}, {L'ϋ', L"y"}, {L'ΐ', L"i"},
          {L'Α', L"A"}, {L'Β', L"B"}, {L'Γ', L"G"}, {L'Δ', L"D"}, {L'Ε', L"E"}, {L'Ζ', L"Z"}, {L'Η', L"H"}, {L'Θ', L"8"},
          {L'Ι', L"I"}, {L'Κ', L"K"}, {L'Λ', L"L"}, {L'Μ', L"M"}, {L'Ν', L"N"}, {L'Ξ', L"3"}, {L'Ο', L"O"}, {L'Π', L"P"},
          {L'Ρ', L"R"}, {L'Σ', L"S"}, {L'Τ', L"T"}, {L'Υ', L"Y"}, {L'Φ', L"F"}, {L'Χ', L"X"}, {L'Ψ', L"PS"}, {L'Ω', L"W"},
          {L'Ά', L"A"}, {L'Έ', L"E"}, {L'Ί', L"I"}, {L'Ό', L"O"}, {L'Ύ', L"Y"}, {L'Ή', L"H"}, {L'Ώ', L"W"}, {L'Ϊ', L"I"},
          {L'Ϋ', L"Y"},
          // turkish
          {L'ş', L"s"}, {L'Ş', L"S"}, {L'ı', L"i"}, {L'İ', L"I"}, {L'ç', L"c"}, {L'Ç', L"C"}, {L'ü', L"u"}, {L'Ü', L"U"},
          {L'ö', L"o"}, {L'Ö', L"O"}, {L'ğ', L"g"}, {L'Ğ', L"G"},
          // russian
          {L'а', L"a"}, {L'б', L"b"}, {L'в', L"v"}, {L'г', L"g"}, {L'д', L"d"}, {L'е', L"e"}, {L'ё', L"yo"}, {L'ж', L"zh"},
          {L'з', L"z"}, {L'и', L"i"}, {L'й', L"j"}, {L'к', L"k"}, {L'л', L"l"}, {L'м', L"m"}, {L'н', L"n"}, {L'о', L"o"},
          {L'п', L"p"}, {L'р', L"r"}, {L'с', L"s"}, {L'т', L"t"}, {L'у', L"u"}, {L'ф', L"f"}, {L'х', L"h"}, {L'ц', L"c"},
          {L'ч', L"ch"}, {L'ш', L"sh"}, {L'щ', L"sh"}, {L'ъ', L"u"}, {L'ы', L"y"}, {L'ь', L""}, {L'э', L"e"}, {L'ю', L"yu"},
          {L'я', L"ya"},
          {L'А', L"A"}, {L'Б', L"B"}, {L'В', L"V"}, {L'Г', L"G"}, {L'Д', L"D"}, {L'Е', L"E"}, {L'Ё', L"Yo"}, {L'Ж', L"Zh"},
          {L'З', L"Z"}, {L'И', L"I"}, {L'Й', L"J"}, {L'К', L"K"}, {L'Л', L"L"}, {L'М', L"M"}, {L'Н', L"N"}, {L'О', L"O"},
          {L'П', L"P"}, {L'Р', L"R"}, {L'С', L"S"}, {L'Т', L"T"}, {L'У', L"U"}, {L'Ф', L"F"}, {L'Х', L"H"}, {L'Ц', L"C"},
          {L'Ч', L"Ch"}, {L'Ш', L"Sh"}, {L'Щ', L"Sh"}, {L'Ъ', L"U"}, {L'Ы', L"Y"}, {L'Ь', L""}, {L'Э', L"E"}, {L'Ю', L"Yu"},
          {L'Я', L"Ya"},
          // ukrainian
          {L'Є', L"Ye"}, {L'І', L"I"}, {L'Ї', L"Yi"}, {L'Ґ', L"G"}, {L'є', L"ye"}, {L'і', L"i"}, {L'ї', L"yi"}, {L'ґ', L"g"},
          // czech
          {L'č', L"c"}, {L'ď', L"d"}, {L'ě', L"e"}, {L'ň', L"n"}, {L'ř', L"r"}, {L'š', L"s"}, {L'ť', L"t"}, {L'ů', L"u"},
          {L'ž', L"z"}, {L'Č', L"C"}, {L'Ď', L"D"}, {L'Ě', L"E"}, {L'Ň', L"N"}, {L'Ř', L"R"}, {L'Š', L"S"}, {L'Ť', L"T"},
          {L'Ů', L"U"}, {L'Ž', L"Z"},
          // polish
          {L'ą', L"a"}, {L'ć', L"c"}, {L'ę', L"e"}, {L'ł', L"l"}, {L'ń', L"n"}, {L'ó', L"o"}, {L'ś', L"s"}, {L'ź', L"z"},
          {L'ż', L"z"}, {L'Ą', L"A"}, {L'Ć', L"C"}, {L'Ę', L"e"}, {L'Ł', L"L"}, {L'Ń', L"N"}, {L'Ś', L"S"},
          {L'Ź', L"Z"}, {L'Ż', L"Z"},
          // latvian
          {L'ā', L"a"}, {L'č', L"c"}, {L'ē', L"e"}, {L'ģ', L"g"}, {L'ī', L"i"}, {L'ķ', L"k"}, {L'ļ', L"l"}, {L'ņ', L"n"},
          {L'š', L"s"}, {L'ū', L"u"}, {L'ž', L"z"}, {L'Ā', L"A"}, {L'Č', L"C"}, {L'Ē', L"E"}, {L'Ģ', L"G"}, {L'Ī', L"i"},
          {L'Ķ', L"k"}, {L'Ļ', L"L"}, {L'Ņ', L"N"}, {L'Š', L"S"}, {L'Ū', L"u"}, {L'Ž', L"Z"},
          // currency
          {L'€', L"euro"}, {L'₢', L"cruzeiro"}, {L'₣', L"french franc"}, {L'£', L"pound"},
          {L'₤', L"lira"}, {L'₥', L"mill"}, {L'₦', L"naira"}, {L'₧', L"peseta"}, {L'₨', L"rupee"},
          {L'₩', L"won"}, {L'₪', L"new shequel"}, {L'₫', L"dong"}, {L'₭', L"kip"}, {L'₮', L"tugrik"},
          {L'₯', L"drachma"}, {L'₰', L"penny"}, {L'₱', L"peso"}, {L'₲', L"guarani"}, {L'₳', L"austral"},
          {L'₴', L"hryvnia"}, {L'₵', L"cedi"}, {L'¢', L"cent"}, {L'¥', L"yen"}, {L'元', L"yuan"},
          {L'円', L"yen"}, {L'﷼', L"rial"}, {L'₠', L"ecu"}, {L'¤', L"currency"}, {L'฿', L"baht"}, {L'$', L"dollar"},
          // symbols
          {L'©', L"(c)"}, {L'œ', L"oe"}, {L'Œ', L"OE"}, {L'∑', L"sum"}, {L'®', L"(r)"}, {L'†', L"+"},
          {L'“', L"\""}, {L'∂', L"d"}, {L'ƒ', L"f"}, {L'™', L"tm"},
          {L'℠', L"sm"}, {L'…', L"..."}, {L'˚', L"o"}, {L'º', L"o"}, {L'ª', L"a"}, {L'•', L"*"},
          {L'∆', L"delta"}, {L'∞', L"infinity"}, {L'♥', L"love"}, {L'&', L"and"}, {L'|', L"or"},
          {L'<', L"less"}, {L'>', L"greater"},
      };

      // Step 1: transliterate. Replacement text may itself contain spaces or
      // punctuation ("french franc", "(c)"), so it goes through the same
      // filtering as ordinary input in the second pass.
      std::wstring expanded;
      expanded.reserve(input.size());
      for (const wchar_t ch : input)
      {
          const auto hit = charMap.find(ch);
          if (hit != charMap.end())
          {
              expanded += hit->second;
          }
          else
          {
              expanded += ch;
          }
      }

      // Steps 2-5 in one pass: lower-case, drop invalid characters, and turn
      // separator runs (hyphens and/or whitespace) into single hyphens.
      std::wstring slug;
      slug.reserve(expanded.size());
      bool inSeparatorRun = false; // a run of '-' / whitespace is pending
      bool runHasHyphen = false;   // that run contains at least one literal '-'
      for (wchar_t ch : expanded)
      {
          if (ch >= L'A' && ch <= L'Z')
          {
              ch = static_cast<wchar_t>(ch - L'A' + L'a');
          }

          if (ch == L'-')
          {
              inSeparatorRun = true;
              runHasHyphen = true;
          }
          else if (std::iswspace(static_cast<std::wint_t>(ch)) != 0)
          {
              inSeparatorRun = true;
          }
          else if ((ch >= L'a' && ch <= L'z') || (ch >= L'0' && ch <= L'9'))
          {
              // A whitespace-only run at the very start is trimmed; any other
              // pending run collapses to one hyphen.
              if (inSeparatorRun && (runHasHyphen || !slug.empty()))
              {
                  slug += L'-';
              }
              inSeparatorRun = false;
              runHasHyphen = false;
              slug += ch;
          }
          // Any other character is invalid in a slug and is dropped without
          // interrupting a pending separator run.
      }

      // A trailing whitespace-only run is trimmed; a trailing run that holds
      // a literal hyphen still produces one.
      if (inSeparatorRun && runHasHyphen)
      {
          slug += L'-';
      }
      return slug;
  }