utf.go 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455
  1. // Copyright (c) Tailscale Inc & AUTHORS
  2. // SPDX-License-Identifier: BSD-3-Clause
  3. package dns
  4. // This code is only used in Windows builds, but is in an
  5. // OS-independent file so tests can run all the time.
  6. import (
  7. "bytes"
  8. "encoding/binary"
  9. "unicode/utf16"
  10. )
  11. // maybeUnUTF16 tries to detect whether bs contains UTF-16, and if so
  12. // translates it to regular UTF-8.
  13. //
  14. // Some of wsl.exe's output get printed as UTF-16, which breaks a
  15. // bunch of things. Try to detect this by looking for a zero byte in
  16. // the first few bytes of output (which will appear if any of those
  17. // codepoints are basic ASCII - very likely). From that we can infer
  18. // that UTF-16 is being printed, and the byte order in use, and we
  19. // decode that back to UTF-8.
  20. //
  21. // https://github.com/microsoft/WSL/issues/4607
  22. func maybeUnUTF16(bs []byte) []byte {
  23. if len(bs)%2 != 0 {
  24. // Can't be complete UTF-16.
  25. return bs
  26. }
  27. checkLen := 20
  28. if len(bs) < checkLen {
  29. checkLen = len(bs)
  30. }
  31. zeroOff := bytes.IndexByte(bs[:checkLen], 0)
  32. if zeroOff == -1 {
  33. return bs
  34. }
  35. // We assume wsl.exe is trying to print an ASCII codepoint,
  36. // meaning the zero byte is in the upper 8 bits of the
  37. // codepoint. That means we can use the zero's byte offset to
  38. // work out if we're seeing little-endian or big-endian
  39. // UTF-16.
  40. var endian binary.ByteOrder = binary.LittleEndian
  41. if zeroOff%2 == 0 {
  42. endian = binary.BigEndian
  43. }
  44. var u16 []uint16
  45. for i := 0; i < len(bs); i += 2 {
  46. u16 = append(u16, endian.Uint16(bs[i:]))
  47. }
  48. return []byte(string(utf16.Decode(u16)))
  49. }