1
0

string_encoding.go 1.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657
  1. package language
  2. import (
  3. "github.com/allanpk716/ChineseSubFinder/internal/logic/charset"
  4. "github.com/allanpk716/ChineseSubFinder/internal/pkg/log_helper"
  5. "github.com/axgle/mahonia"
  6. nzlov "github.com/nzlov/chardet"
  7. "strings"
  8. )
  9. // ConvertToString 将字符串从原始编码转换到目标编码,需要配合字符串检测编码库使用 chardet.NewTextDetector()
  10. func ConvertToString(src string, srcCode string, tagCode string) string {
  11. defer func() {
  12. if err := recover(); err != nil {
  13. log_helper.GetLogger().Errorln("ConvertToString panic:", err)
  14. }
  15. }()
  16. srcCoder := mahonia.NewDecoder(srcCode)
  17. srcResult := srcCoder.ConvertString(src)
  18. tagCoder := mahonia.NewDecoder(tagCode)
  19. _, cdata, _ := tagCoder.Translate([]byte(srcResult), true)
  20. result := string(cdata)
  21. return result
  22. }
  23. // 感谢: https://blog.csdn.net/gaoluhua/article/details/109128154,解决了编码问题
  24. // ChangeFileCoding2UTF8 自动检测文件的编码,然后转换到 UTF-8,但是导出 bytes 的时候会把头部的 BOM 信息去除
  25. func ChangeFileCoding2UTF8(inBytes []byte) ([]byte, error) {
  26. best, err := detector.DetectBest(inBytes)
  27. utf8String := ""
  28. if err != nil {
  29. return nil, err
  30. }
  31. if best.Confidence < 90 {
  32. detectBest := nzlov.Mostlike(inBytes)
  33. utf8String, err = charset.ToUTF8(charset.Charset(detectBest), string(inBytes))
  34. } else {
  35. utf8String, err = charset.ToUTF8(charset.Charset(best.Charset), string(inBytes))
  36. }
  37. if err != nil {
  38. return nil, err
  39. }
  40. if utf8String == "" {
  41. return inBytes, nil
  42. }
  43. // 然后返回的时候需要去除头部的 BOM 信息
  44. dat := []byte(utf8String)
  45. if dat[0] == 0xef || dat[1] == 0xbb || dat[2] == 0xbf {
  46. dat = dat[3:]
  47. }
  48. // 在确认一次
  49. validUTF8String := strings.ToValidUTF8(string(dat[:]), "")
  50. return []byte(validUTF8String), nil
  51. }