4 anni fa · 77999e73a6
--- a/go.mod
+++ b/go.mod
@@ -10,6 +10,7 @@ require (
 
				 	github.com/axgle/mahonia v0.0.0-20180208002826-3358181d7394
			
 
				 	github.com/beevik/etree v1.1.0
			
 
				 	github.com/gen2brain/go-unarr v0.1.1
			
 
				+	github.com/go-creed/sat v1.0.3 // indirect
			
 
				 	github.com/go-resty/resty/v2 v2.6.0
			
 
				 	github.com/go-rod/rod v0.97.2
			
 
				 	github.com/jonboulle/clockwork v0.2.2 // indirect
			
@@ -27,3 +28,5 @@ require (
 
				 	github.com/t-tomalak/logrus-easy-formatter v0.0.0-20190827215021-c074f06c5816
			
 
				 	golang.org/x/text v0.3.3
			
 
				 )
			
 
				+
			
 
				+replace github.com/go-creed/sat => github.com/allanpk716/sat v0.0.0-20210622112535-2e00ce54a80b
			
--- a/go.sum
+++ b/go.sum
@@ -27,6 +27,8 @@ github.com/abadojack/whatlanggo v1.0.1 h1:19N6YogDnf71CTHm3Mp2qhYfkRdyvbgwWdd2EP
 
				 github.com/abadojack/whatlanggo v1.0.1/go.mod h1:66WiQbSbJBIlOZMsvbKe5m6pzQovxCH9B/K8tQB2uoc=
			
 
				 github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
			
 
				 github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
			
 
				+github.com/allanpk716/sat v0.0.0-20210622112535-2e00ce54a80b h1:vxXQuRgTH0D6aacl7Jyk4Kz8N4wbVTFn7M7Ib+/IFq4=
			
 
				+github.com/allanpk716/sat v0.0.0-20210622112535-2e00ce54a80b/go.mod h1:ZxAhQ0ikMzjqeMbFeoMdCr6es8p10Y87F2nHkqNjSbY=
			
 
				 github.com/andybalholm/brotli v1.0.0 h1:7UCwP93aiSfvWpapti8g88vVVGp2qqtGyePsSuDafo4=
			
 
				 github.com/andybalholm/brotli v1.0.0/go.mod h1:loMXtMfwqflxFJPmdbJO0a3KNoPuLBgiu3qAvBg8x/Y=
			
 
				 github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
			
--- a/model/decode.go
+++ b/model/decode.go
@@ -36,7 +36,6 @@ func getImdbAndYearMovieXml(movieFilePath string) (common.VideoInfo, error) {
 
				 }
			
 
				 
			
 
				 func getImdbAndYearNfo(nfoFilePath string) (common.VideoInfo, error) {
			
 
				-	// TODO 新增 TVDB ID 的读取
			
 
				 	imdbInfo := common.VideoInfo{}
			
 
				 	doc := etree.NewDocument()
			
 
				 	// 这里会遇到一个梗，下面的关键词，可能是小写、大写、首字母大写
			
--- a/model/language.go
+++ b/model/language.go
@@ -4,6 +4,7 @@ import (
 
				 	"github.com/abadojack/whatlanggo"
			
 
				 	"github.com/allanpk716/ChineseSubFinder/common"
			
 
				 	"github.com/axgle/mahonia"
			
 
				+	"github.com/go-creed/sat"
			
 
				 	"github.com/saintfish/chardet"
			
 
				 	"strings"
			
 
				 )
			
@@ -151,7 +152,8 @@ func IsWhiteListLang(lang whatlanggo.Lang) bool {
 
				 }
			
 
				 
			
 
				 // DetectSubLangAndStatistics 检测语言然后统计
			
 
				-func DetectSubLangAndStatistics(lines []string, langDict map[int]int) {
			
 
				+func DetectSubLangAndStatistics(lines []string, langDict map[int]int, chLines *[]string) {
			
 
				+
			
 
				 	for _, line := range lines {
			
 
				 		info := whatlanggo.DetectWithOptions(line, GetLangOptions())
			
 
				 		tmpLang := -1
			
@@ -167,11 +169,15 @@ func DetectSubLangAndStatistics(lines []string, langDict map[int]int) {
 
				 		} else {
			
 
				 			langDict[tmpLang] = 1
			
 
				 		}
			
 
				+		// 统计中文有多少行
			
 
				+		if info.Lang == whatlanggo.Cmn {
			
 
				+			*chLines = append(*chLines, line)
			
 
				+		}
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				 // SubLangStatistics2SubLangType 由分析的信息转换为具体是什么字幕的语言类型
			
 
				-func SubLangStatistics2SubLangType(countLineFeed, AllLines float32, langDict map[int]int) common.Language {
			
 
				+func SubLangStatistics2SubLangType(countLineFeed, AllLines float32, langDict map[int]int, chLines []string) common.Language {
			
 
				 	const basePer = 0.8
			
 
				 	// 是否是双语？
			
 
				 	isDouble := false
			
@@ -180,7 +186,7 @@ func SubLangStatistics2SubLangType(countLineFeed, AllLines float32, langDict map
 
				 	if perLines > basePer {
			
 
				 		isDouble = true
			
 
				 	}
			
 
				-	// 中文
			
 
				+	// 中文(包含了 chs 以及 cht，这一级是无法区分的，需要额外的简体和繁体区分方法)
			
 
				 	countChinese, hasChinese := langDict[int(whatlanggo.Cmn)]
			
 
				 	// 英文
			
 
				 	countEnglish, hasEnglish := langDict[int(whatlanggo.Eng)]
			
@@ -188,21 +194,37 @@ func SubLangStatistics2SubLangType(countLineFeed, AllLines float32, langDict map
 
				 	countJapanese, hasJapanese := langDict[int(whatlanggo.Jpn)]
			
 
				 	// 韩文
			
 
				 	countKorean, hasKorean := langDict[int(whatlanggo.Kor)]
			
 
				+	// 0 - No , 1 - Chs, 2 - Cht
			
 
				+	isNoOrChsOrCht := 0
			
 
				+	isChsCount := 0
			
 
				+	if hasChinese {
			
 
				+		for _, line := range chLines {
			
 
				+			if chDict.IsChs(line, 0.9) == true {
			
 
				+				isChsCount++
			
 
				+			}
			
 
				+		}
			
 
				+		// 简体句子的占比超过 80%
			
 
				+		if float32(isChsCount) / float32(len(chLines)) > 0.8 {
			
 
				+			isNoOrChsOrCht = 1
			
 
				+		} else {
			
 
				+			isNoOrChsOrCht = 2
			
 
				+		}
			
 
				+	}
			
 
				 
			
 
				 	// 优先判断双语
			
 
				 	if isDouble == true {
			
 
				 		// 首先得在外面统计就知道是双语
			
 
				 		if hasChinese && hasEnglish {
			
 
				 			// 简体	英文
			
 
				-			return common.ChineseSimpleEnglish
			
 
				+			return chIsChsOrCht(common.ChineseSimpleEnglish, isNoOrChsOrCht)
			
 
				 		} else if hasChinese && hasJapanese {
			
 
				 			// 简体 日文
			
 
				-			return common.ChineseSimpleJapanese
			
 
				+			return chIsChsOrCht(common.ChineseSimpleJapanese, isNoOrChsOrCht)
			
 
				 		} else if hasChinese && hasKorean {
			
 
				 			// 简体 韩文
			
 
				-			return common.ChineseSimpleKorean
			
 
				+			return chIsChsOrCht(common.ChineseSimpleKorean, isNoOrChsOrCht)
			
 
				 		} else if hasChinese {
			
 
				-			return common.ChineseSimple
			
 
				+			return chIsChsOrCht(common.ChineseSimple, isNoOrChsOrCht)
			
 
				 		} else if hasEnglish {
			
 
				 			return common.English
			
 
				 		} else if hasJapanese {
			
@@ -219,7 +241,7 @@ func SubLangStatistics2SubLangType(countLineFeed, AllLines float32, langDict map
 
				 			// 那么起码要占比 80% 对吧
			
 
				 			perLines = float32(countChinese) / AllLines
			
 
				 			if perLines > basePer {
			
 
				-				return common.ChineseSimple
			
 
				+				return chIsChsOrCht(common.ChineseSimple, isNoOrChsOrCht)
			
 
				 			}
			
 
				 		}
			
 
				 		if hasEnglish {
			
@@ -249,7 +271,31 @@ func SubLangStatistics2SubLangType(countLineFeed, AllLines float32, langDict map
 
				 
			
 
				 }
			
 
				 
			
 
				-// IsChineseSimpleOrTraditional 从字幕的文件名称中尝试确认是简体还是繁体，不需要判断双语问题，有额外的解析器完成。只可能出现 ChineseSimple ChineseTraditional Unknow 三种情况
			
 
				+// chIsChsOrCht returns language unchanged when isNoOrChsOrCht is 0 or 1; for any other value (callers pass 2 for Traditional) it maps a ChineseSimple* value to its ChineseTraditional* counterpart. Only Chinese-related languages need to be passed in.
			
 
				+func chIsChsOrCht(language common.Language, isNoOrChsOrCht int) common.Language {
			
 
				+	// Not Chinese (0) or Simplified (1): keep the language the caller supplied.
			
 
				+	if isNoOrChsOrCht == 0 || isNoOrChsOrCht == 1 {
			
 
				+		return language
			
 
				+	}
			
 
				+	switch language {
			
 
				+	case common.ChineseSimpleEnglish:
			
 
				+		// Simplified + English -> Traditional + English
			
 
				+		return common.ChineseTraditionalEnglish
			
 
				+	case common.ChineseSimpleJapanese:
			
 
				+		// Simplified + Japanese -> Traditional + Japanese
			
 
				+		return common.ChineseTraditionalJapanese
			
 
				+	case common.ChineseSimpleKorean:
			
 
				+		// Simplified + Korean -> Traditional + Korean
			
 
				+		return common.ChineseTraditionalKorean
			
 
				+	case common.ChineseSimple:
			
 
				+		// Simplified only -> Traditional only
			
 
				+		return common.ChineseTraditional
			
 
				+	default:
			
 
				+		return language
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+// IsChineseSimpleOrTraditional 暂时弃用，在 SubLangStatistics2SubLangType 检测语言，通过 unicode 做到。 从字幕的文件名称中尝试确认是简体还是繁体，不需要判断双语问题，有额外的解析器完成。只可能出现 ChineseSimple ChineseTraditional Unknow 三种情况
			
 
				 func IsChineseSimpleOrTraditional(inputFileName string, orgLang common.Language) common.Language {
			
 
				 	// TODO 现在是没有很好的办法去识别是简体还是繁体中文的，所以是依赖判断文件名中的关键词做到的，会有一定的误判
			
 
				 	if strings.Contains(inputFileName, common.SubNameKeywordChineseSimple) || strings.Contains(inputFileName, common.MatchLangChs) {
			
@@ -321,4 +367,8 @@ func FindChineseBestSubtitle(subs []common.SubParserFileInfo) *common.SubParserF
 
				 		}
			
 
				 	}
			
 
				 	return nil
			
 
				-}
			
 
				+}
			
 
				+
			
 
				+var (
			
 
				+	chDict = sat.DefaultDict() // package-level dictionary; SubLangStatistics2SubLangType calls chDict.IsChs on each Chinese line
			
 
				+)
			
--- a/model/subParserHub.go
+++ b/model/subParserHub.go
@@ -38,9 +38,9 @@ func (p SubParserHub) DetermineFileTypeFromFile(filePath string) (*common.SubPar
 
				 		} else {
			
 
				 			// 正常至少应该匹配一个吧，不然就是最外层继续返回 nil 出去了
			
 
				 			// 简体和繁体字幕的判断，通过文件名来做到的，基本就算个补判而已
			
 
				-			newLang := IsChineseSimpleOrTraditional(filePath, subFileInfo.Lang)
			
 
				+			//newLang := IsChineseSimpleOrTraditional(filePath, subFileInfo.Lang)
			
 
				 			subFileInfo.Name = filepath.Base(filePath)
			
 
				-			subFileInfo.Lang = newLang
			
 
				+			//subFileInfo.Lang = newLang
			
 
				 			subFileInfo.FileFullPath = filePath
			
 
				 			subFileInfo.FromWhereSite = p.getFromWhereSite(filePath)
			
 
				 			return subFileInfo, nil
			
--- a/sub_parser/ass/ass.go
+++ b/sub_parser/ass/ass.go
@@ -101,11 +101,12 @@ func (p Parser) DetermineFileTypeFromBytes(inBytes []byte, nowExt string) (*comm
 
				 	// 需要判断每一个 Line 是啥语言，[语言的code]次数
			
 
				 	var langDict map[int]int
			
 
				 	langDict = make(map[int]int)
			
 
				+	var chLines = make([]string, 0)
			
 
				 	for _, dialogue := range subFileInfo.Dialogues {
			
 
				-		model.DetectSubLangAndStatistics(dialogue.Lines, langDict)
			
 
				+		model.DetectSubLangAndStatistics(dialogue.Lines, langDict, &chLines)
			
 
				 	}
			
 
				 	// 从统计出来的字典，找出 Top 1 或者 2 的出来，然后计算出是什么语言的字幕
			
 
				-	detectLang := model.SubLangStatistics2SubLangType(float32(countLineFeed), float32(usefullDialogueCount), langDict)
			
 
				+	detectLang := model.SubLangStatistics2SubLangType(float32(countLineFeed), float32(usefullDialogueCount), langDict, chLines)
			
 
				 	subFileInfo.Lang = detectLang
			
 
				 	subFileInfo.Data = inBytes
			
 
				 	return &subFileInfo, nil
			
--- a/sub_parser/ass/ass_test.go
+++ b/sub_parser/ass/ass_test.go
@@ -9,7 +9,9 @@ func TestParser_DetermineFileType(t *testing.T) {
 
				 	//filePath := "C:\\Tmp\\saw9.ass"
			
 
				 	//filePath := "C:\\tmp\\[zimuku]_0_oslo.2021.1080p.web.h264-naisu.简体&英文.ass"
			
 
				 	//filePath := "C:\\tmp\\oslo.2021.1080p.web.h264-naisu.简体&英文.ass"
			
 
				-	filePath := "C:\\Tmp\\Loki - S01E01 - Glorious Purpose WEBDL-1080p Proper.chs[subhd].ass"
			
 
				+	//filePath := "C:\\Tmp\\Loki - S01E01 - Glorious Purpose WEBDL-1080p Proper.chs[subhd].ass"
			
 
				+	//filePath := "C:\\Tmp\\oslo.2021.1080p.web.h264-naisu.繁体&英文.ass"
			
 
				+	filePath := "C:\\Tmp\\oslo.2021.1080p.web.h264-naisu.繁体.ass"
			
 
				 	parser := NewParser()
			
 
				 	sfi, err := parser.DetermineFileTypeFromFile(filePath)
			
 
				 	if err != nil {
			
--- a/sub_parser/srt/srt.go
+++ b/sub_parser/srt/srt.go
@@ -79,11 +79,12 @@ func (p Parser) DetermineFileTypeFromBytes(inBytes []byte, nowExt string) (*comm
 
				 	// 需要判断每一个 Line 是啥语言，[语言的code]次数
			
 
				 	var langDict map[int]int
			
 
				 	langDict = make(map[int]int)
			
 
				+	var chLines = make([]string, 0)
			
 
				 	for _, dialogue := range subFileInfo.Dialogues {
			
 
				-		model.DetectSubLangAndStatistics(dialogue.Lines, langDict)
			
 
				+		model.DetectSubLangAndStatistics(dialogue.Lines, langDict, &chLines)
			
 
				 	}
			
 
				 	// 从统计出来的字典，找出 Top 1 或者 2 的出来，然后计算出是什么语言的字幕
			
 
				-	detectLang := model.SubLangStatistics2SubLangType(float32(countLineFeed), float32(len(matched)), langDict)
			
 
				+	detectLang := model.SubLangStatistics2SubLangType(float32(countLineFeed), float32(len(matched)), langDict, chLines)
			
 
				 	subFileInfo.Lang = detectLang
			
 
				 	subFileInfo.Data = inBytes
			
 
				 	return &subFileInfo, nil