Browse Source

改进字幕编码格式检测和转换为 UTF-8 的方法，应该能在一定程度上提高正确率

Signed-off-by: allan716 <[email protected]>
allan716 4 years ago
parent
commit
77dc6a0bbb
4 changed files with 66 additions and 22 deletions
  1. 3 1
      go.mod
  2. 6 0
      go.sum
  3. 21 8
      model/language.go
  4. 36 13
      sub_parser/ass/ass_test.go

+ 3 - 1
go.mod

@@ -19,10 +19,12 @@ require (
 	github.com/mholt/archiver/v3 v3.5.0
 	github.com/middelink/go-parse-torrent-name v0.0.0-20190301154245-3ff4efacd4c4
 	github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646
+	github.com/nzlov/chardet v0.0.0-20190815145004-022cbcf483f9 // indirect
 	github.com/panjf2000/ants/v2 v2.4.5
 	github.com/pkg/errors v0.9.1 // indirect
+	github.com/qiniu/iconv v1.2.0 // indirect
 	github.com/robfig/cron/v3 v3.0.0
-	github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca
+	github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
 	github.com/sirupsen/logrus v1.8.1
 	github.com/spf13/viper v1.7.1
 	github.com/t-tomalak/logrus-easy-formatter v0.0.0-20190827215021-c074f06c5816

+ 6 - 0
go.sum

@@ -56,6 +56,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ=
 github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no=
+github.com/djimenez/iconv-go v0.0.0-20160305225143-8960e66bd3da h1:0qwwqQCLOOXPl58ljnq3sTJR7yRuMolM02vjxDh4ZVE=
+github.com/djimenez/iconv-go v0.0.0-20160305225143-8960e66bd3da/go.mod h1:ns+zIWBBchgfRdxNgIJWn2x6U95LQchxeqiN5Cgdgts=
 github.com/dsnet/compress v0.0.1 h1:PlZu0n3Tuv04TzpfPbrnI0HW/YwodEXDS+oPKahKF0Q=
 github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo=
 github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY=
@@ -180,6 +182,8 @@ github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 h1:zYyBkD/k9seD2A7fsi6
 github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646/go.mod h1:jpp1/29i3P1S/RLdc7JQKbRpFeM1dOBd8T9ki5s+AY8=
 github.com/nwaples/rardecode v1.1.0 h1:vSxaY8vQhOcVr4mm5e8XllHWTiM4JF507A0Katqw7MQ=
 github.com/nwaples/rardecode v1.1.0/go.mod h1:5DzqNKiOdpKKBH87u8VlvAnPZMXcGRhxWkRpHbbfGS0=
+github.com/nzlov/chardet v0.0.0-20190815145004-022cbcf483f9 h1:Fr9qvoS1YZWpT+tlwMd2ILFxhcYQZ9neP4zbw7J25zQ=
+github.com/nzlov/chardet v0.0.0-20190815145004-022cbcf483f9/go.mod h1:1TeXuGo9we3KPG7S3u3TfJ3KcPbM2j2dOGy56Y6ktnI=
 github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U=
 github.com/panjf2000/ants/v2 v2.4.5 h1:kcGvjXB7ea0MrzzszpnlVFthhYKoFxLi75nRbsq01HY=
 github.com/panjf2000/ants/v2 v2.4.5/go.mod h1:f6F0NZVFsGCp5A7QW/Zj/m92atWwOkY0OIhFxRNFr4A=
@@ -204,6 +208,8 @@ github.com/prometheus/common v0.4.0/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y8
 github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk=
 github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
 github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU=
+github.com/qiniu/iconv v1.2.0 h1:2LJKwoF+4LJ3lNM+7cE3P1kNQzAI/HMZuWhkmFoY2U8=
+github.com/qiniu/iconv v1.2.0/go.mod h1:5bxb2h9lptZt2eHLgY+Jw4X06TMtKb6tvvok0DwSwGA=
 github.com/robfig/cron/v3 v3.0.0 h1:kQ6Cb7aHOHTSzNVNEhmp8EcWKLb4CbiMW9h9VyIhO4E=
 github.com/robfig/cron/v3 v3.0.0/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro=
 github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg=

+ 21 - 8
model/language.go

@@ -5,6 +5,8 @@ import (
 	"github.com/allanpk716/ChineseSubFinder/common"
 	"github.com/axgle/mahonia"
 	"github.com/go-creed/sat"
+	chardet2 "github.com/nzlov/chardet"
+	"github.com/qiniu/iconv"
 	"github.com/saintfish/chardet"
 	"strings"
 )
@@ -341,19 +343,30 @@ func ConvertToString(src string, srcCode string, tagCode string) string {
 
 // ChangeFileCoding2UTF8 自动检测文件的编码,然后转换到 UTF-8
 func ChangeFileCoding2UTF8(inBytes []byte) ([]byte, error) {
-	detector := chardet.NewTextDetector()
-	result, err := detector.DetectBest(inBytes)
+	best, err := detector.DetectBest(inBytes)
 	if err != nil {
-		return nil ,err
+		return nil, err
 	}
-	ouBytes := inBytes
-	if result.Charset != "UTF-8" {
-		ouString := ConvertToString(string(inBytes), result.Charset, "UTF-8")
-		ouBytes = []byte(ouString)
+	var cd iconv.Iconv
+	if best.Confidence < 90 {
+		detectBest := chardet2.Mostlike(inBytes)
+		cd, err = iconv.Open("utf-8", detectBest)
+	} else {
+		cd, err = iconv.Open("utf-8", best.Charset)
+	}
+	if err != nil {
+		return nil, err
+	}
+	defer cd.Close()
+
+	utf8String := cd.ConvString(string(inBytes))
+	if utf8String == "" {
+		return inBytes, nil
 	}
-	return ouBytes, nil
+	return []byte(utf8String), nil
 }
 
 var (
 	chDict = sat.DefaultDict()
+	detector = chardet.NewTextDetector()
 )

+ 36 - 13
sub_parser/ass/ass_test.go

@@ -4,18 +4,41 @@ import (
 	"testing"
 )
 
-func TestParser_DetermineFileType(t *testing.T) {
+func TestParser_DetermineFileTypeFromFile(t *testing.T) {
+	type args struct {
+		filePath string
+	}
+	tests := []struct {
+		name    string
+		args    args
+		wantNil bool
+		wantErr bool
+	}{
+		{name: "1", args: args{filePath: "C:\\Tmp\\Loki - S01E01 - Glorious Purpose WEBDL-1080p Proper.chs[subhd].ass"}, wantNil: false, wantErr: false},
+		{name: "3", args: args{filePath: "C:\\tmp\\oslo.2021.1080p.web.h264-naisu.简体&英文.ass"}, wantNil: false, wantErr: false},
+		{name: "4", args: args{filePath: "C:\\Tmp\\oslo.2021.1080p.web.h264-naisu.繁体&英文.ass"}, wantNil: false, wantErr: false},
+		{name: "5", args: args{filePath: "C:\\Tmp\\oslo.2021.1080p.web.h264-naisu.繁体.ass"}, wantNil: false, wantErr: false},
+		{name: "6", args: args{filePath: "X:\\连续剧\\黑镜 (2011)\\Season 2\\黑镜 - S02E02 - 白熊.en.ass"}, wantNil: false, wantErr: false},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			p := Parser{}
+			got, err := p.DetermineFileTypeFromFile(tt.args.filePath)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("DetermineFileTypeFromFile() error = %v, wantErr %v", err, tt.wantErr)
+				t.Fatal(err)
+				return
+			}
+
+			if got == nil && tt.wantNil == true {
+
+			} else if got != nil && tt.wantNil == false {
+
+			} else {
+				t.Fatal("DetermineFileTypeFromFile got:", got, "wantNil:", tt.wantNil)
+			}
 
-	//filePath := "C:\\Tmp\\saw9.ass"
-	//filePath := "C:\\tmp\\[zimuku]_0_oslo.2021.1080p.web.h264-naisu.简体&英文.ass"
-	//filePath := "C:\\tmp\\oslo.2021.1080p.web.h264-naisu.简体&英文.ass"
-	//filePath := "C:\\Tmp\\Loki - S01E01 - Glorious Purpose WEBDL-1080p Proper.chs[subhd].ass"
-	//filePath := "C:\\Tmp\\oslo.2021.1080p.web.h264-naisu.繁体&英文.ass"
-	filePath := "C:\\Tmp\\oslo.2021.1080p.web.h264-naisu.繁体.ass"
-	parser := NewParser()
-	sfi, err := parser.DetermineFileTypeFromFile(filePath)
-	if err != nil {
-		t.Fatal(err)
+			println(got.Name, got.Ext, got.Lang)
+		})
 	}
-	println(sfi.Name, sfi.Lang.String(), sfi.Ext)
-}
+}