Sfoglia il codice sorgente

替换检测简体繁体的方案

Signed-off-by: allan716 <[email protected]>
allan716 4 anni fa
parent
commit
77999e73a6
8 ha cambiato i file con 76 aggiunte e 18 eliminazioni
  1. 3 0
      go.mod
  2. 2 0
      go.sum
  3. 0 1
      model/decode.go
  4. 60 10
      model/language.go
  5. 2 2
      model/subParserHub.go
  6. 3 2
      sub_parser/ass/ass.go
  7. 3 1
      sub_parser/ass/ass_test.go
  8. 3 2
      sub_parser/srt/srt.go

+ 3 - 0
go.mod

@@ -10,6 +10,7 @@ require (
 	github.com/axgle/mahonia v0.0.0-20180208002826-3358181d7394
 	github.com/beevik/etree v1.1.0
 	github.com/gen2brain/go-unarr v0.1.1
+	github.com/go-creed/sat v1.0.3 // indirect
 	github.com/go-resty/resty/v2 v2.6.0
 	github.com/go-rod/rod v0.97.2
 	github.com/jonboulle/clockwork v0.2.2 // indirect
@@ -27,3 +28,5 @@ require (
 	github.com/t-tomalak/logrus-easy-formatter v0.0.0-20190827215021-c074f06c5816
 	golang.org/x/text v0.3.3
 )
+
+replace github.com/go-creed/sat => github.com/allanpk716/sat v0.0.0-20210622112535-2e00ce54a80b

+ 2 - 0
go.sum

@@ -27,6 +27,8 @@ github.com/abadojack/whatlanggo v1.0.1 h1:19N6YogDnf71CTHm3Mp2qhYfkRdyvbgwWdd2EP
 github.com/abadojack/whatlanggo v1.0.1/go.mod h1:66WiQbSbJBIlOZMsvbKe5m6pzQovxCH9B/K8tQB2uoc=
 github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
 github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
+github.com/allanpk716/sat v0.0.0-20210622112535-2e00ce54a80b h1:vxXQuRgTH0D6aacl7Jyk4Kz8N4wbVTFn7M7Ib+/IFq4=
+github.com/allanpk716/sat v0.0.0-20210622112535-2e00ce54a80b/go.mod h1:ZxAhQ0ikMzjqeMbFeoMdCr6es8p10Y87F2nHkqNjSbY=
 github.com/andybalholm/brotli v1.0.0 h1:7UCwP93aiSfvWpapti8g88vVVGp2qqtGyePsSuDafo4=
 github.com/andybalholm/brotli v1.0.0/go.mod h1:loMXtMfwqflxFJPmdbJO0a3KNoPuLBgiu3qAvBg8x/Y=
 github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=

+ 0 - 1
model/decode.go

@@ -36,7 +36,6 @@ func getImdbAndYearMovieXml(movieFilePath string) (common.VideoInfo, error) {
 }
 
 func getImdbAndYearNfo(nfoFilePath string) (common.VideoInfo, error) {
-	// TODO 新增 TVDB ID 的读取
 	imdbInfo := common.VideoInfo{}
 	doc := etree.NewDocument()
 	// 这里会遇到一个梗,下面的关键词,可能是小写、大写、首字母大写

+ 60 - 10
model/language.go

@@ -4,6 +4,7 @@ import (
 	"github.com/abadojack/whatlanggo"
 	"github.com/allanpk716/ChineseSubFinder/common"
 	"github.com/axgle/mahonia"
+	"github.com/go-creed/sat"
 	"github.com/saintfish/chardet"
 	"strings"
 )
@@ -151,7 +152,8 @@ func IsWhiteListLang(lang whatlanggo.Lang) bool {
 }
 
 // DetectSubLangAndStatistics 检测语言然后统计
-func DetectSubLangAndStatistics(lines []string, langDict map[int]int) {
+func DetectSubLangAndStatistics(lines []string, langDict map[int]int, chLines *[]string) {
+
 	for _, line := range lines {
 		info := whatlanggo.DetectWithOptions(line, GetLangOptions())
 		tmpLang := -1
@@ -167,11 +169,15 @@ func DetectSubLangAndStatistics(lines []string, langDict map[int]int) {
 		} else {
 			langDict[tmpLang] = 1
 		}
+		// 统计中文有多少行
+		if info.Lang == whatlanggo.Cmn {
+			*chLines = append(*chLines, line)
+		}
 	}
 }
 
 // SubLangStatistics2SubLangType 由分析的信息转换为具体是什么字幕的语言类型
-func SubLangStatistics2SubLangType(countLineFeed, AllLines float32, langDict map[int]int) common.Language {
+func SubLangStatistics2SubLangType(countLineFeed, AllLines float32, langDict map[int]int, chLines []string) common.Language {
 	const basePer = 0.8
 	// 是否是双语?
 	isDouble := false
@@ -180,7 +186,7 @@ func SubLangStatistics2SubLangType(countLineFeed, AllLines float32, langDict map
 	if perLines > basePer {
 		isDouble = true
 	}
-	// 中文
+	// 中文(包含了 chs 以及 cht,这一级是无法区分的,需要额外的简体和繁体区分方法)
 	countChinese, hasChinese := langDict[int(whatlanggo.Cmn)]
 	// 英文
 	countEnglish, hasEnglish := langDict[int(whatlanggo.Eng)]
@@ -188,21 +194,37 @@ func SubLangStatistics2SubLangType(countLineFeed, AllLines float32, langDict map
 	countJapanese, hasJapanese := langDict[int(whatlanggo.Jpn)]
 	// 韩文
 	countKorean, hasKorean := langDict[int(whatlanggo.Kor)]
+	// 0 - No , 1 - Chs, 2 - Cht
+	isNoOrChsOrCht := 0
+	isChsCount := 0
+	if hasChinese {
+		for _, line := range chLines {
+			if chDict.IsChs(line, 0.9) == true {
+				isChsCount++
+			}
+		}
+		// 简体句子的占比超过 80%
+		if float32(isChsCount) / float32(len(chLines)) > 0.8 {
+			isNoOrChsOrCht = 1
+		} else {
+			isNoOrChsOrCht = 2
+		}
+	}
 
 	// 优先判断双语
 	if isDouble == true {
 		// 首先得在外面统计就知道是双语
 		if hasChinese && hasEnglish {
 			// 简体	英文
-			return common.ChineseSimpleEnglish
+			return chIsChsOrCht(common.ChineseSimpleEnglish, isNoOrChsOrCht)
 		} else if hasChinese && hasJapanese {
 			// 简体 日文
-			return common.ChineseSimpleJapanese
+			return chIsChsOrCht(common.ChineseSimpleJapanese, isNoOrChsOrCht)
 		} else if hasChinese && hasKorean {
 			// 简体 韩文
-			return common.ChineseSimpleKorean
+			return chIsChsOrCht(common.ChineseSimpleKorean, isNoOrChsOrCht)
 		} else if hasChinese {
-			return common.ChineseSimple
+			return chIsChsOrCht(common.ChineseSimple, isNoOrChsOrCht)
 		} else if hasEnglish {
 			return common.English
 		} else if hasJapanese {
@@ -219,7 +241,7 @@ func SubLangStatistics2SubLangType(countLineFeed, AllLines float32, langDict map
 			// 那么起码要占比 80% 对吧
 			perLines = float32(countChinese) / AllLines
 			if perLines > basePer {
-				return common.ChineseSimple
+				return chIsChsOrCht(common.ChineseSimple, isNoOrChsOrCht)
 			}
 		}
 		if hasEnglish {
@@ -249,7 +271,31 @@ func SubLangStatistics2SubLangType(countLineFeed, AllLines float32, langDict map
 
 }
 
-// IsChineseSimpleOrTraditional 从字幕的文件名称中尝试确认是简体还是繁体,不需要判断双语问题,有额外的解析器完成。只可能出现 ChineseSimple ChineseTraditional Unknow 三种情况
+// chIsChsOrCht returns language unchanged when isNoOrChsOrCht is 0 (no Chinese) or 1 (Simplified); for any other value it maps a Simplified-Chinese language constant to its Traditional counterpart, and returns every other language as is.
+func chIsChsOrCht(language common.Language, isNoOrChsOrCht int) common.Language {
+	// No Chinese detected, or detected as Simplified: keep the language as given.
+	if isNoOrChsOrCht == 0 || isNoOrChsOrCht == 1 {
+		return language
+	}
+	switch language {
+	case common.ChineseSimpleEnglish:
+		// Simplified + English -> Traditional + English
+		return common.ChineseTraditionalEnglish
+	case common.ChineseSimpleJapanese:
+		// Simplified + Japanese -> Traditional + Japanese
+		return common.ChineseTraditionalJapanese
+	case common.ChineseSimpleKorean:
+		// Simplified + Korean -> Traditional + Korean
+		return common.ChineseTraditionalKorean
+	case common.ChineseSimple:
+		// Simplified only -> Traditional only
+		return common.ChineseTraditional
+	default:
+		return language
+	}
+}
+
+// IsChineseSimpleOrTraditional 暂时弃用,在 SubLangStatistics2SubLangType 检测语言,通过 unicode 做到。 从字幕的文件名称中尝试确认是简体还是繁体,不需要判断双语问题,有额外的解析器完成。只可能出现 ChineseSimple ChineseTraditional Unknow 三种情况
 func IsChineseSimpleOrTraditional(inputFileName string, orgLang common.Language) common.Language {
 	// TODO 现在是没有很好的办法去识别是简体还是繁体中文的,所以是依赖判断文件名中的关键词做到的,会有一定的误判
 	if strings.Contains(inputFileName, common.SubNameKeywordChineseSimple) || strings.Contains(inputFileName, common.MatchLangChs) {
@@ -321,4 +367,8 @@ func FindChineseBestSubtitle(subs []common.SubParserFileInfo) *common.SubParserF
 		}
 	}
 	return nil
-}
+}
+
+var (
+	chDict = sat.DefaultDict() // shared dictionary whose IsChs method is used to classify each Chinese line; NOTE(review): presumably safe to build once at package init — confirm against the sat package
+)

+ 2 - 2
model/subParserHub.go

@@ -38,9 +38,9 @@ func (p SubParserHub) DetermineFileTypeFromFile(filePath string) (*common.SubPar
 		} else {
 			// 正常至少应该匹配一个吧,不然就是最外层继续返回 nil 出去了
 			// 简体和繁体字幕的判断,通过文件名来做到的,基本就算个补判而已
-			newLang := IsChineseSimpleOrTraditional(filePath, subFileInfo.Lang)
+			//newLang := IsChineseSimpleOrTraditional(filePath, subFileInfo.Lang)
 			subFileInfo.Name = filepath.Base(filePath)
-			subFileInfo.Lang = newLang
+			//subFileInfo.Lang = newLang
 			subFileInfo.FileFullPath = filePath
 			subFileInfo.FromWhereSite = p.getFromWhereSite(filePath)
 			return subFileInfo, nil

+ 3 - 2
sub_parser/ass/ass.go

@@ -101,11 +101,12 @@ func (p Parser) DetermineFileTypeFromBytes(inBytes []byte, nowExt string) (*comm
 	// 需要判断每一个 Line 是啥语言,[语言的code]次数
 	var langDict map[int]int
 	langDict = make(map[int]int)
+	var chLines = make([]string, 0)
 	for _, dialogue := range subFileInfo.Dialogues {
-		model.DetectSubLangAndStatistics(dialogue.Lines, langDict)
+		model.DetectSubLangAndStatistics(dialogue.Lines, langDict, &chLines)
 	}
 	// 从统计出来的字典,找出 Top 1 或者 2 的出来,然后计算出是什么语言的字幕
-	detectLang := model.SubLangStatistics2SubLangType(float32(countLineFeed), float32(usefullDialogueCount), langDict)
+	detectLang := model.SubLangStatistics2SubLangType(float32(countLineFeed), float32(usefullDialogueCount), langDict, chLines)
 	subFileInfo.Lang = detectLang
 	subFileInfo.Data = inBytes
 	return &subFileInfo, nil

+ 3 - 1
sub_parser/ass/ass_test.go

@@ -9,7 +9,9 @@ func TestParser_DetermineFileType(t *testing.T) {
 	//filePath := "C:\\Tmp\\saw9.ass"
 	//filePath := "C:\\tmp\\[zimuku]_0_oslo.2021.1080p.web.h264-naisu.简体&英文.ass"
 	//filePath := "C:\\tmp\\oslo.2021.1080p.web.h264-naisu.简体&英文.ass"
-	filePath := "C:\\Tmp\\Loki - S01E01 - Glorious Purpose WEBDL-1080p Proper.chs[subhd].ass"
+	//filePath := "C:\\Tmp\\Loki - S01E01 - Glorious Purpose WEBDL-1080p Proper.chs[subhd].ass"
+	//filePath := "C:\\Tmp\\oslo.2021.1080p.web.h264-naisu.繁体&英文.ass"
+	filePath := "C:\\Tmp\\oslo.2021.1080p.web.h264-naisu.繁体.ass"
 	parser := NewParser()
 	sfi, err := parser.DetermineFileTypeFromFile(filePath)
 	if err != nil {

+ 3 - 2
sub_parser/srt/srt.go

@@ -79,11 +79,12 @@ func (p Parser) DetermineFileTypeFromBytes(inBytes []byte, nowExt string) (*comm
 	// 需要判断每一个 Line 是啥语言,[语言的code]次数
 	var langDict map[int]int
 	langDict = make(map[int]int)
+	var chLines = make([]string, 0)
 	for _, dialogue := range subFileInfo.Dialogues {
-		model.DetectSubLangAndStatistics(dialogue.Lines, langDict)
+		model.DetectSubLangAndStatistics(dialogue.Lines, langDict, &chLines)
 	}
 	// 从统计出来的字典,找出 Top 1 或者 2 的出来,然后计算出是什么语言的字幕
-	detectLang := model.SubLangStatistics2SubLangType(float32(countLineFeed), float32(len(matched)), langDict)
+	detectLang := model.SubLangStatistics2SubLangType(float32(countLineFeed), float32(len(matched)), langDict, chLines)
 	subFileInfo.Lang = detectLang
 	subFileInfo.Data = inBytes
 	return &subFileInfo, nil