5 yıl önce · 3e24d82513
--- a/lib/fs/folding.go
+++ b/lib/fs/folding.go
@@ -7,13 +7,39 @@
 
				 package fs
			
 
				 
			
 
				 import (
			
 
				+	"strings"
			
 
				 	"unicode"
			
 
				+	"unicode/utf8"
			
 
				 )
			
 
				 
			
 
				 func UnicodeLowercase(s string) string {
			
 
				-	rs := []rune(s)
			
 
				-	for i, r := range rs {
			
 
				-		rs[i] = unicode.ToLower(unicode.ToUpper(r))
			
 
				+	i := firstCaseChange(s)
			
 
				+	if i == -1 {
			
 
				+		return s
			
 
				 	}
			
 
				-	return string(rs)
			
 
				+
			
 
				+	var rs strings.Builder
			
 
				+	// WriteRune always reserves utf8.UTFMax bytes for non-ASCII runes,
			
 
				+	// even if it doesn't need all that space. Overallocate now to prevent
			
 
				+	// it from triggering a reallocation (unless folding lengthens the string).
			
 
				+	rs.Grow(utf8.UTFMax - 1 + len(s))
			
 
				+	rs.WriteString(s[:i])
			
 
				+
			
 
				+	for _, r := range s[i:] {
			
 
				+		rs.WriteRune(unicode.ToLower(unicode.ToUpper(r)))
			
 
				+	}
			
 
				+	return rs.String()
			
 
				+}
			
 
				+
			
 
				+// firstCaseChange returns the byte index of the first rune r for which lower(upper(r)) != r, or -1 if there is none.
			
 
				+func firstCaseChange(s string) int {
			
 
				+	for i, r := range s {
			
 
				+		if r <= unicode.MaxASCII && (r < 'A' || r > 'Z') {
			
 
				+			continue
			
 
				+		}
			
 
				+		if unicode.ToLower(unicode.ToUpper(r)) != r {
			
 
				+			return i
			
 
				+		}
			
 
				+	}
			
 
				+	return -1
			
 
				 }
			
--- a/lib/fs/folding_test.go
+++ b/lib/fs/folding_test.go
@@ -6,45 +6,71 @@
 
				 
			
 
				 package fs
			
 
				 
			
 
				-import "testing"
			
 
				+import (
			
 
				+	"testing"
			
 
				+)
			
 
				+
			
 
				+var caseCases = [][2]string{
			
 
				+	{"", ""},
			
 
				+	{"hej", "hej"},
			
 
				+	{"HeJ!@#", "hej!@#"},
			
 
				+	// Western Europe diacritical stuff is trivial.
			
 
				+	{"ÜBERRÄKSMÖRGÅS", "überräksmörgås"},
			
 
				+	// As are ligatures.
			
 
				+	{"Æglefinus", "æglefinus"},
			
 
				+	{"Ĳssel", "ĳssel"},
			
 
				+	// Cyrillic seems regular as well.
			
 
				+	{"Привет", "привет"},
			
 
				+	// Greek has multiple lower case characters for things depending on
			
 
				+	// context; we should always choose the same one.
			
 
				+	{"Ὀδυσσεύς", "ὀδυσσεύσ"},
			
 
				+	{"ὈΔΥΣΣΕΎΣ", "ὀδυσσεύσ"},
			
 
				+	// German ß doesn't really have an upper case variant, and we
			
 
				+	// shouldn't mess things up when lower casing it either. We don't
			
 
				+	// attempt to make ß equivalent to "ss".
			
 
				+	{"Reichwaldstraße", "reichwaldstraße"},
			
 
				+	// The Turks do their thing with the Is.... Like the Greek example
			
 
				+	// we pick just the one canonicalized "i" although you can argue
			
 
				+	// with this... From what I understand most operating systems don't
			
 
				+	// get this right anyway.
			
 
				+	{"İI", "ii"},
			
 
				+	// Arabic doesn't do case folding.
			
 
				+	{"العَرَبِيَّة", "العَرَبِيَّة"},
			
 
				+	// Neither does Hebrew.
			
 
				+	{"עברית", "עברית"},
			
 
				+	// Nor Chinese, in any variant.
			
 
				+	{"汉语/漢語 or 中文", "汉语/漢語 or 中文"},
			
 
				+	// Nor katakana, as far as I can tell.
			
 
				+	{"チャーハン", "チャーハン"},
			
 
				+	// Some special Unicode characters, however, are folded by OSes.
			
 
				+	{"\u212A", "k"},
			
 
				+}
			
 
				 
			
 
				 func TestUnicodeLowercase(t *testing.T) {
			
 
				-	cases := [][2]string{
			
 
				-		{"", ""},
			
 
				-		{"hej", "hej"},
			
 
				-		{"HeJ!@#", "hej!@#"},
			
 
				-		// Western Europe diacritical stuff is trivial
			
 
				-		{"ÜBERRÄKSMÖRGÅS", "überräksmörgås"},
			
 
				-		// Cyrillic seems regular as well
			
 
				-		{"Привет", "привет"},
			
 
				-		// Greek has multiple lower case characters for things depending on
			
 
				-		// context; we should always choose the right one.
			
 
				-		{"Ὀδυσσεύς", "ὀδυσσεύσ"},
			
 
				-		{"ὈΔΥΣΣΕΎΣ", "ὀδυσσεύσ"},
			
 
				-		// German ß doesn't really have an upper case variant, and we
			
 
				-		// shouldn't mess things up when lower casing it either. We don't
			
 
				-		// attempt to make ß equivalent to "ss".
			
 
				-		{"Reichwaldstraße", "reichwaldstraße"},
			
 
				-		// The Turks do their thing with the Is.... Like the Greek example
			
 
				-		// we pick just the one canonicalized "i" although you can argue
			
 
				-		// with this... From what I understand most operating systems don't
			
 
				-		// get this right anyway.
			
 
				-		{"İI", "ii"},
			
 
				-		// Arabic doesn't do case folding.
			
 
				-		{"العَرَبِيَّة", "العَرَبِيَّة"},
			
 
				-		// Neither does Hebrew.
			
 
				-		{"עברית", "עברית"},
			
 
				-		// Nor Chinese, in any variant.
			
 
				-		{"汉语/漢語 or 中文", "汉语/漢語 or 中文"},
			
 
				-		// Niether katakana as far as I can tell.
			
 
				-		{"チャーハン", "チャーハン"},
			
 
				-		// Some special unicode characters, however, are folded by OSes
			
 
				-		{"\u212A", "k"},
			
 
				-	}
			
 
				-	for _, tc := range cases {
			
 
				+	for _, tc := range caseCases {
			
 
				 		res := UnicodeLowercase(tc[0])
			
 
				 		if res != tc[1] {
			
 
				 			t.Errorf("UnicodeLowercase(%q) => %q, expected %q", tc[0], res, tc[1])
			
 
				 		}
			
 
				 	}
			
 
				 }
			
 
				+
			
 
				+func BenchmarkUnicodeLowercaseMaybeChange(b *testing.B) {
			
 
				+	b.ReportAllocs()
			
 
				+
			
 
				+	for i := 0; i < b.N; i++ {
			
 
				+		for _, s := range caseCases {
			
 
				+			UnicodeLowercase(s[0])
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+func BenchmarkUnicodeLowercaseNoChange(b *testing.B) {
			
 
				+	b.ReportAllocs()
			
 
				+
			
 
				+	for i := 0; i < b.N; i++ {
			
 
				+		for _, s := range caseCases {
			
 
				+			UnicodeLowercase(s[1])
			
 
				+		}
			
 
				+	}
			
 
				+}