folding_test.go 2.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182
  1. // Copyright (C) 2017 The Syncthing Authors.
  2. //
  3. // This Source Code Form is subject to the terms of the Mozilla Public
  4. // License, v. 2.0. If a copy of the MPL was not distributed with this file,
  5. // You can obtain one at https://mozilla.org/MPL/2.0/.
  6. package fs
  7. import (
  8. "testing"
  9. )
// caseCases is the table that drives TestUnicodeLowercaseNormalized. Each
// entry is a pair: element 0 is the input handed to
// UnicodeLowercaseNormalized and element 1 is the exact string the test
// expects to get back.
var caseCases = [][2]string{
	// The empty string, plain ASCII, and ASCII mixed with punctuation.
	{"", ""},
	{"hej", "hej"},
	{"HeJ!@#", "hej!@#"},
	// Western European diacritics are trivial.
	{"ÜBERRÄKSMÖRGÅS", "überräksmörgås"},
	// As are ligatures.
	{"Æglefinus", "æglefinus"},
	// NOTE(review): "IJssel" below appears to be spelled with the two ASCII
	// letters I and J rather than the single Ĳ ligature (U+0132) — confirm
	// which one was intended.
	{"IJssel", "ijssel"},
	// Cyrillic seems regular as well.
	{"Привет", "привет"},
	// Greek has more than one lower case form for some letters depending on
	// context; we should always choose the same one. Both inputs below are
	// expected to produce the same output, with the word-final sigma (ς)
	// coming back as the regular sigma (σ).
	{"Ὀδυσσεύς", "ὀδυσσεύσ"},
	{"ὈΔΥΣΣΕΎΣ", "ὀδυσσεύσ"},
	// German ß doesn't really have an upper case variant, and we
	// shouldn't mess things up when lower casing it either. We don't
	// attempt to make ß equivalent to "ss".
	{"Reichwaldstraße", "reichwaldstraße"},
	// Turkish distinguishes dotted and dotless I. As with the Greek example
	// we pick a single canonical "i" for both the dotted capital İ and the
	// plain capital I, although that choice is debatable. From what I
	// understand most operating systems don't get this right anyway.
	{"İI", "ii"},
	// Arabic doesn't do case folding.
	{"العَرَبِيَّة", "العَرَبِيَّة"},
	// Neither does Hebrew.
	{"עברית", "עברית"},
	// Nor Chinese, in any variant.
	{"汉语/漢語 or 中文", "汉语/漢語 or 中文"},
	// Nor katakana, as far as I can tell.
	{"チャーハン", "チャーハン"},
	// Some special Unicode characters, however, are folded by OSes.
	// U+212A is the Kelvin sign, expected to come back as an ASCII "k".
	{"\u212A", "k"},
	// Folding renormalizes to NFC: a base letter followed by the combining
	// diaeresis (U+0308, bytes CC 88) is expected to come back as the
	// single precomposed ä (U+00E4, bytes C3 A4).
	{"A\xCC\x88", "\xC3\xA4"}, // ä
	{"a\xCC\x88", "\xC3\xA4"}, // ä
}
// benchmarkCases is the table that drives BenchmarkUnicodeLowercase. Each
// entry is a pair: element 0 is the file name passed to
// UnicodeLowercaseNormalized and element 1 is the name given to the
// sub-benchmark for that input.
var benchmarkCases = [][2]string{
	// ASCII-only names: all lower case, upper case at the start, upper case
	// at the end.
	{"img_202401241010.jpg", "ASCII lowercase"},
	{"IMG_202401241010.jpg", "ASCII mixedcase start"},
	{"img_202401241010.JPG", "ASCII mixedcase end"},
	// Names containing a non-ASCII Latin letter (ü), same three variations.
	{"wir_kinder_aus_bullerbü.epub", "Latin1 lowercase"},
	{"Wir_Kinder_aus_Bullerbü.epub", "Latin1 mixedcase start"},
	{"wir_kinder_aus_bullerbü.EPUB", "Latin1 mixedcase end"},
	// Names containing Japanese text, same three variations.
	{"translated_ウェブの国際化.html", "Unicode lowercase"},
	{"Translated_ウェブの国際化.html", "Unicode mixedcase start"},
	{"translated_ウェブの国際化.HTML", "Unicode mixedcase end"},
}
  59. func TestUnicodeLowercaseNormalized(t *testing.T) {
  60. for _, tc := range caseCases {
  61. res := UnicodeLowercaseNormalized(tc[0])
  62. if res != tc[1] {
  63. t.Errorf("UnicodeLowercaseNormalized(%q) => %q, expected %q", tc[0], res, tc[1])
  64. }
  65. }
  66. }
  67. func BenchmarkUnicodeLowercase(b *testing.B) {
  68. for _, c := range benchmarkCases {
  69. b.Run(c[1], func(b *testing.B) {
  70. b.ReportAllocs()
  71. for i := 0; i < b.N; i++ {
  72. UnicodeLowercaseNormalized(c[0])
  73. }
  74. })
  75. }
  76. }