Explorar o código

解决内置字幕 Dialogue 合并问题

Signed-off-by: 716 <[email protected]>
716 hai 4 anos
pai
achega
27543919a5

+ 10 - 6
internal/logic/sub_timeline_fixer/sub_timeline_fixer_helper.go

@@ -11,6 +11,7 @@ import (
 	"github.com/allanpk716/ChineseSubFinder/internal/pkg/log_helper"
 	formatterEmby "github.com/allanpk716/ChineseSubFinder/internal/pkg/sub_formatter/emby"
 	"github.com/allanpk716/ChineseSubFinder/internal/pkg/sub_formatter/normal"
+	"github.com/allanpk716/ChineseSubFinder/internal/pkg/sub_helper"
 	"github.com/allanpk716/ChineseSubFinder/internal/pkg/sub_parser_hub"
 	"github.com/allanpk716/ChineseSubFinder/internal/pkg/sub_timeline_fixer"
 	"github.com/allanpk716/ChineseSubFinder/internal/types/emby"
@@ -115,11 +116,13 @@ func (s SubTimelineFixerHelper) fixOneVideoSub(videoId string, videoRootPath str
 		for _, info := range subFixInfos {
 			// 写入 fix 后的字幕文件覆盖之前的字幕文件
 			desFixedSubFullName := path.Join(videoRootPath, info.FileName)
-			err = s.saveSubFile(desFixedSubFullName, info.FixContent)
-			if err != nil {
-				return err
-			}
 			log_helper.GetLogger().Debugln("Sub Timeline fixed:", desFixedSubFullName)
+			continue
+			//err = s.saveSubFile(desFixedSubFullName, info.FixContent)
+			//if err != nil {
+			//	return err
+			//}
+			//log_helper.GetLogger().Debugln("Sub Timeline fixed:", desFixedSubFullName)
 		}
 	}
 
@@ -137,9 +140,10 @@ func (s SubTimelineFixerHelper) fixSubTimeline(enSubFile emby.SubInfo, ch_enSubF
 	}
 	infoBase.Name = enSubFile.FileName
 	/*
-		这里发现一个梗,内置的英文字幕导出的时候,很可能单个 Dialogue 会有 \N 在中间,需要单独去除,拼接成一句话
-		否则后续的两个字幕文件的对白匹配识别的时候会有问题,因为残缺语句
+		这里发现一个梗,内置的英文字幕导出的时候,有可能需要合并多个 Dialogue,见
+		internal/pkg/sub_helper/sub_helper.go 中 MergeMultiDialogue4EngSubtitle 的实现
 	*/
+	sub_helper.MergeMultiDialogue4EngSubtitle(infoBase)
 
 	bFind, infoSrc, err := s.subParserHub.DetermineFileTypeFromBytes(ch_enSubFile.Content, ch_enSubFile.Ext)
 	if err != nil {

+ 87 - 0
internal/pkg/sub_helper/dialogue_merger.go

@@ -0,0 +1,87 @@
+package sub_helper
+
+import "github.com/allanpk716/ChineseSubFinder/internal/types/subparser"
+
+// DialogueMerger rebuilds English sentences that a subtitle file has split
+// across consecutive dialogues, so that later dialogue matching works on whole
+// sentences instead of fragments.
+//
+// Dialogues are kept in the order they were added. A dialogue whose EnLine
+// starts with an upper-case ASCII letter becomes the "anchor". The next
+// dialogue whose EnLine starts with a lower-case ASCII letter is appended to
+// the anchor's EnLine, after which the anchor is released, so at most one
+// continuation is attached to each anchor.
+//
+// The zero value is an empty merger ready to use.
+type DialogueMerger struct {
+	// dialogues holds every kept dialogue in insertion order. Using a slice
+	// instead of a map keyed by StartTime keeps dialogues that share the same
+	// start time distinct.
+	dialogues []subparser.OneDialogueEx
+
+	// anchor is the index in dialogues of the dialogue that can still take a
+	// continuation. It is only meaningful while hasAnchor is true.
+	anchor    int
+	hasAnchor bool
+}
+
+// NewDialogueMerger returns an empty DialogueMerger.
+func NewDialogueMerger() *DialogueMerger {
+	return &DialogueMerger{}
+}
+
+// Add feeds one dialogue into the merger.
+//
+// It returns true when the dialogue was recognised as a sentence start
+// (upper-case first letter, stored as the new anchor) or as a continuation
+// (lower-case first letter, appended to the current anchor). It returns false
+// when the dialogue was stored unchanged without taking part in merging: its
+// EnLine starts with neither kind of letter, or it is a lower-case line with
+// no anchor to attach to. No dialogue is ever discarded.
+//
+// NOTE(review): only EnLine is merged. If OneDialogueEx also carries an end
+// time, the merged dialogue keeps the anchor's value; confirm whether it
+// should be extended to cover the continuation.
+func (d *DialogueMerger) Add(inDialogueEx subparser.OneDialogueEx) bool {
+	switch {
+	case isFirstLetterIsEngUpper(inDialogueEx.EnLine):
+		// A sentence start: store it and remember it as the merge target.
+		d.dialogues = append(d.dialogues, inDialogueEx)
+		d.anchor, d.hasAnchor = len(d.dialogues)-1, true
+		return true
+	case isFirstLetterIsEngLower(inDialogueEx.EnLine) && d.hasAnchor:
+		// A continuation: glue it onto the anchor, then release the anchor.
+		d.dialogues[d.anchor].EnLine += " " + inDialogueEx.EnLine
+		d.hasAnchor = false
+		return true
+	default:
+		// Anything else is kept as its own dialogue. The anchor is left
+		// untouched, so a later lower-case line can still attach to it.
+		d.dialogues = append(d.dialogues, inDialogueEx)
+		return false
+	}
+}
+
+// Clear releases the current anchor so the next lower-case dialogue is not
+// appended to an earlier one. Dialogues already stored are kept.
+func (d *DialogueMerger) Clear() {
+	d.hasAnchor = false
+}
+
+// Get returns a copy of the stored dialogues in insertion order. The returned
+// slice is never nil.
+func (d *DialogueMerger) Get() []subparser.OneDialogueEx {
+	outDialogueExList := make([]subparser.OneDialogueEx, len(d.dialogues))
+	copy(outDialogueExList, d.dialogues)
+	return outDialogueExList
+}
+
+// isFirstLetterIsEngUpper reports whether the first byte of instring is an
+// upper-case ASCII letter (A-Z). An empty string yields false.
+func isFirstLetterIsEngUpper(instring string) bool {
+	if instring == "" {
+		return false
+	}
+	first := instring[0]
+	return 'A' <= first && first <= 'Z'
+}
+
+// isFirstLetterIsEngLower reports whether the first byte of instring is a
+// lower-case ASCII letter (a-z). An empty string yields false.
+func isFirstLetterIsEngLower(instring string) bool {
+	if instring == "" {
+		return false
+	}
+	first := instring[0]
+	return 'a' <= first && first <= 'z'
+}

+ 81 - 0
internal/pkg/sub_helper/dialogue_merger_test.go

@@ -0,0 +1,81 @@
+package sub_helper
+
+import (
+	"github.com/allanpk716/ChineseSubFinder/internal/logic/sub_parser/ass"
+	"github.com/allanpk716/ChineseSubFinder/internal/logic/sub_parser/srt"
+	"github.com/allanpk716/ChineseSubFinder/internal/pkg"
+	"github.com/allanpk716/ChineseSubFinder/internal/pkg/sub_parser_hub"
+	"path"
+	"testing"
+)
+
+// Test_isFirstLetterIsEngUpper checks upper-case detection against an
+// upper-case letter, a lower-case letter, a non-ASCII character and the empty
+// string.
+func Test_isFirstLetterIsEngUpper(t *testing.T) {
+	cases := []struct {
+		name     string
+		instring string
+		want     bool
+	}{
+		{name: "0", instring: "A", want: true},
+		{name: "1", instring: "a", want: false},
+		{name: "2", instring: "哈", want: false},
+		{name: "3", instring: "", want: false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := isFirstLetterIsEngUpper(tc.instring)
+			if got != tc.want {
+				t.Errorf("isFirstLetterIsEngUpper() = %v, want %v", got, tc.want)
+			}
+		})
+	}
+}
+
+// Test_isFirstLetterIsEngLower checks lower-case detection against an
+// upper-case letter, a lower-case letter, a non-ASCII character and the empty
+// string.
+func Test_isFirstLetterIsEngLower(t *testing.T) {
+	cases := []struct {
+		name     string
+		instring string
+		want     bool
+	}{
+		{name: "0", instring: "A", want: false},
+		{name: "1", instring: "a", want: true},
+		{name: "2", instring: "哈", want: false},
+		{name: "3", instring: "", want: false},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := isFirstLetterIsEngLower(tc.instring)
+			if got != tc.want {
+				t.Errorf("isFirstLetterIsEngLower() = %v, want %v", got, tc.want)
+			}
+		})
+	}
+}
+
+// TestNewDialogueMerger parses a sample subtitle whose sentences are split
+// across dialogues, feeds every dialogue through a DialogueMerger, and checks
+// that the merged result is non-empty and no longer than the input.
+func TestNewDialogueMerger(t *testing.T) {
+
+	testDataPath := "../../../TestData/FixTimeline"
+	testRootDir, err := pkg.CopyTestData(testDataPath)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	subParserHub := sub_parser_hub.NewSubParserHub(ass.NewParser(), srt.NewParser())
+	bFind, infoBase, err := subParserHub.DetermineFileTypeFromFile(path.Join(testRootDir, "2line-The Card Counter (2021) WEBDL-1080p.chinese(inside).ass"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !bFind {
+		t.Fatal("not find")
+	}
+
+	merger := NewDialogueMerger()
+	for _, ex := range infoBase.DialoguesEx {
+		merger.Add(ex)
+	}
+	newEx := merger.Get()
+
+	// Merging only ever joins dialogues together, so the output must contain
+	// something and can never be longer than the input.
+	if len(newEx) == 0 {
+		t.Fatal("merger returned no dialogues")
+	}
+	if len(newEx) > len(infoBase.DialoguesEx) {
+		t.Errorf("merged dialogue count = %d, want at most %d", len(newEx), len(infoBase.DialoguesEx))
+	}
+	t.Logf("merged %d dialogues into %d", len(infoBase.DialoguesEx), len(newEx))
+}

+ 17 - 0
internal/pkg/sub_helper/sub_helper.go

@@ -312,6 +312,23 @@ func DeleteOneSeasonSubCacheFolder(seriesDir string) error {
 	return nil
 }
 
+// MergeMultiDialogue4EngSubtitle merges dialogues of an English subtitle that
+// were split mid-sentence, replacing inSubParser.DialoguesEx with the merged
+// list. A nil inSubParser is ignored.
+//
+// Some subtitles, for example
+// "2line-The Card Counter (2021) WEBDL-1080p.chinese(inside).ass", spread one
+// sentence over two dialogues. Timeline correction then only sees half
+// sentences and the match ratio becomes very low. The heuristic used (see
+// DialogueMerger) is that an English sentence starts with an upper-case
+// letter, so a dialogue starting with a lower-case letter is joined to the
+// most recent dialogue that started with an upper-case letter.
+//
+// NOTE(review): the original design note also proposed gathering upper/lower
+// first-letter statistics and only merging lines above a minimum length;
+// neither is implemented here.
+func MergeMultiDialogue4EngSubtitle(inSubParser *subparser.FileInfo) {
+	if inSubParser == nil {
+		return
+	}
+	merger := NewDialogueMerger()
+	for _, dialogueEx := range inSubParser.DialoguesEx {
+		merger.Add(dialogueEx)
+	}
+	inSubParser.DialoguesEx = merger.Get()
+}
+
 var (
 	// regOneSeasonSubFolderNameMatch matches, at the start of any line
 	// (multi-line mode), the text "Sub_S" followed by one digit and "E0".
 	regOneSeasonSubFolderNameMatch = regexp.MustCompile(`(?m)^Sub_S\dE0`)
 )

+ 0 - 9
internal/pkg/sub_parser_hub/subParserHub.go

@@ -54,15 +54,6 @@ func (p SubParserHub) DetermineFileTypeFromFile(filePath string) (bool, *subpars
 // DetermineFileTypeFromBytes 确定字幕文件的类型,是双语字幕或者某一种语言等等信息,如果返回 nil ,那么就说明都没有字幕的格式匹配上
 func (p SubParserHub) DetermineFileTypeFromBytes(inBytes []byte, nowExt string) (bool, *subparser.FileInfo, error) {
 
-	/*
-		会遇到这样的字幕,如下
-		2line-The Card Counter (2021) WEBDL-1080p.chinese(inside).ass
-		它的对白一句话分了两个 dialogue 去做。这样做后续字幕时间轴校正就会遇到问题,因为只有一半,匹配占比会很低
-		(每一个 Dialogue 的首字母需要分析,大写和小写的占比是多少,统计一下,正常的,和上述特殊的)
-		那么,就需要额外的逻辑去对 DialoguesEx 进行额外的推断
-		暂时考虑的方案是,英文对白每一句的开头应该是英文大写字幕,如果是小写字幕,就应该与上语句合并,且每一句的字符长度有大于一定才触发
-	*/
-
 	for _, parser := range p.Parser {
 		bFind, subFileInfo, err := parser.DetermineFileTypeFromBytes(inBytes, nowExt)
 		if err != nil {

+ 4 - 1
internal/pkg/sub_timeline_fixer/fixer.go

@@ -130,9 +130,12 @@ func GetOffsetTime(infoBase, infoSrc *subparser.FileInfo, staticLineFileSavePath
 	matchIndexLineCount := len(matchIndexList) * maxCompareDialogue
 	perMatch := float64(matchIndexLineCount) / float64(len(infoSrc.DialoguesEx))
 	if perMatch < 0.1 {
-		log_helper.GetLogger().Debugln("The proportion of matching dialogue is relatively low(< 10%), Skip",
+		log_helper.GetLogger().Debugln("Sequence match 5 dialogues (< 10%), Skip",
 			fmt.Sprintf("%f", perMatch), infoSrc.Name)
 		return 0, nil
+	} else {
+		log_helper.GetLogger().Debugln("Sequence match 5 dialogues:",
+			fmt.Sprintf("%f", perMatch), infoSrc.Name)
 	}
 
 	timeFormat := ""