Browse Source

修复字幕语言识别问题

Signed-off-by: allan716 <[email protected]>
allan716 4 years ago
parent
commit
adcaa606ba

+ 1 - 0
common/subParserFileInfo.go

@@ -14,5 +14,6 @@ type SubParserFileInfo struct {
 type OneDialogue struct { // OneDialogue holds one parsed subtitle dialogue entry.
 	StartTime string		// start time
 	EndTime string			// end time
+	StyleName	string			// style name of the dialogue line
 	Lines	[]string		// dialogue text lines
 }

+ 14 - 2
downloader_test.go

@@ -2,6 +2,9 @@ package main
 
 import (
 	"github.com/allanpk716/ChineseSubFinder/common"
+	"github.com/allanpk716/ChineseSubFinder/model"
+	"github.com/allanpk716/ChineseSubFinder/sub_parser/ass"
+	"github.com/allanpk716/ChineseSubFinder/sub_parser/srt"
 	"testing"
 )
 
@@ -29,11 +32,11 @@ func TestDownloader_DownloadSub4Movie(t *testing.T) {
 func TestDownloader_DownloadSub4Series(t *testing.T) {
 	var err error
 	//dirRoot := "X:\\连续剧\\隐秘的角落 (2020)"
-	dirRoot := "X:\\连续剧\\The Bad Batch"
+	//dirRoot := "X:\\连续剧\\The Bad Batch"
 	//dirRoot := "X:\\连续剧\\豪斯医生 (2004)"
 	//dirRoot := "X:\\连续剧\\Why Women Kill"
 	//dirRoot := "X:\\连续剧\\Mare of Easttown"
-	//dirRoot := "X:\\连续剧\\瑞克和莫蒂 (2013)"
+	dirRoot := "X:\\连续剧\\瑞克和莫蒂 (2013)"
 	//dirRoot := "X:\\连续剧\\黄石 (2018)"
 	//dirRoot := "X:\\连续剧"
 
@@ -44,4 +47,13 @@ func TestDownloader_DownloadSub4Series(t *testing.T) {
 	if err != nil {
 		t.Fatal(err)
 	}
+}
+
+func TestDownloader_SubParserHub(t *testing.T) { // Runs IsSubHasChinese against one hard-coded local subtitle file.
+	//subFile := "X:\\连续剧\\瑞克和莫蒂 (2013)\\Season 4\\瑞克和莫蒂 - S04E01 - Rick and Morty.chs[zimuku].ass"
+	//subFile := "X:\\连续剧\\瑞克和莫蒂 (2013)\\Season 1\\瑞克和莫蒂 - S01E01 - 试播集.en.ass"
+	subFile := "X:\\连续剧\\瑞克和莫蒂 (2013)\\Season 1\\瑞克和莫蒂 - S01E01 - 试播集.chs_en[zimuku].ass"
+
+	subParserHub := model.NewSubParserHub(ass.NewParser(), srt.NewParser()) // hub built from the ass and srt parsers
+	subParserHub.IsSubHasChinese(subFile) // NOTE(review): the bool result is discarded, so this test asserts nothing
 }

+ 8 - 0
model/decode.go

@@ -46,6 +46,11 @@ func getImdbAndYearNfo(nfoFilePath string, rootKey string) (common.VideoIMDBInfo
 	if err != nil {
 		return imdbInfo, err
 	}
+	for _, t := range doc.FindElements("./" + rootKey +"/title") {
+		imdbInfo.Title = t.Text()
+		break
+	}
+	//---------------------------------------------------------------------
 	for _, t := range doc.FindElements("./" + rootKey +"/imdb_id") {
 		imdbInfo.ImdbId = t.Text()
 		break
@@ -62,14 +67,17 @@ func getImdbAndYearNfo(nfoFilePath string, rootKey string) (common.VideoIMDBInfo
 		imdbInfo.ImdbId = t.Text()
 		break
 	}
+	//---------------------------------------------------------------------
 	for _, t := range doc.FindElements("./" + rootKey +"/year") {
 		imdbInfo.Year = t.Text()
 		break
 	}
+	//---------------------------------------------------------------------
 	for _, t := range doc.FindElements("./" + rootKey + "/releasedate") {
 		imdbInfo.ReleaseDate = t.Text()
 		break
 	}
+	//---------------------------------------------------------------------
 	for _, t := range doc.FindElements("./" + rootKey + "/premiered") {
 		imdbInfo.ReleaseDate = t.Text()
 		break

+ 1 - 1
model/subParserHub.go

@@ -64,7 +64,7 @@ func (p SubParserHub) IsSubHasChinese(fileFPath string) bool {
 		return false
 	}
 	if HasChineseLang(file.Lang) == false {
-		GetLogger().Warnln("IsSubHasChinese.HasChineseLang", fileFPath, "not chinese sub, is ")
+		GetLogger().Warnln("IsSubHasChinese.HasChineseLang", fileFPath, "not chinese sub, is ", file.Lang.String())
 		return false
 	}
 

+ 38 - 3
sub_parser/ass/ass.go

@@ -6,6 +6,7 @@ import (
 	"io/ioutil"
 	"path/filepath"
 	"regexp"
+	"sort"
 	"strings"
 )
 
@@ -55,18 +56,32 @@ func (p Parser) DetermineFileTypeFromBytes(inBytes []byte, nowExt string) (*comm
 	countLineFeed := 0
 	// 有意义的对话统计数,排除 Style 类型
 	usefullDialogueCount := 0
+	// 先进行字幕 StyleName 的出现次数排序,找到最多的,就是常规字幕的,不是特效的
+	var nameMap = make(map[string]int)
+	for _, oneLine := range matched {
+		nowStyleName := oneLine[3]
+		_, ok := nameMap[nowStyleName]
+		if ok == false {
+			nameMap[nowStyleName] = 1
+		} else {
+			nameMap[nowStyleName]++
+		}
+	}
+	mapByValue := sortMapByValue(nameMap)
 	// 先读取一次字幕文件
 	for _, oneLine := range matched {
 		// 排除特效内容,只统计有意义的对话部分
-		if strings.Contains(oneLine[0], "Default") == false {
+		if strings.Contains(oneLine[0], mapByValue[0].Name) == false {
 			continue
 		}
 		usefullDialogueCount++
 
 		startTime := oneLine[1]
 		endTime := oneLine[2]
-		nowText := oneLine[3]
+		nowStyleName := oneLine[3]
+		nowText := oneLine[4]
 		odl := common.OneDialogue{
+			StyleName: nowStyleName,
 			StartTime: startTime,
 			EndTime: endTime,
 		}
@@ -114,7 +129,27 @@ func (p Parser) DetermineFileTypeFromBytes(inBytes []byte, nowExt string) (*comm
 
 const (
 	// Matches each Dialogue line of a subtitle file; the current pattern captures, in order: start time, end time, style name, text.
-	regString = `Dialogue: [^,.]*[0-9]*,([1-9]?[0-9]*:[0-9]*:[0-9]*.[0-9]*),([1-9]?[0-9]*:[0-9]*:[0-9]*.[0-9]*),[^,.]*,[^,.]*,[0-9]*,[0-9]*,[0-9]*,[^,.]*,(.*)`
+	//regString = `Dialogue: [^,.]*[0-9]*,([1-9]?[0-9]*:[0-9]*:[0-9]*.[0-9]*),([1-9]?[0-9]*:[0-9]*:[0-9]*.[0-9]*),[^,.]*,[^,.]*,[0-9]*,[0-9]*,[0-9]*,[^,.]*,(.*)`
+	regString = `Dialogue: [^,.]*[0-9]*,([1-9]?[0-9]*:[0-9]*:[0-9]*.[0-9]*),([1-9]?[0-9]*:[0-9]*:[0-9]*.[0-9]*),([^,.]*),[^,.]*,[0-9]*,[0-9]*,[0-9]*,[^,.]*,(.*)`
 	// Matches the Style entries in an ass file; captures the first field after "Style:".
 	regString4Style = `(?m)^Style:\s*(\w+),`
 )
+
+type StyleNameInfo struct { // StyleNameInfo pairs a style name with an occurrence count.
+	Name string
+	Count  int
+}
+type StyleNameInfos []StyleNameInfo // StyleNameInfos implements sort.Interface, ordered by Count.
+func (a StyleNameInfos) Len() int           { return len(a) }
+func (a StyleNameInfos) Less(i, j int) bool { return a[i].Count < a[j].Count } // ascending by Count
+func (a StyleNameInfos) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
+func sortMapByValue(m map[string]int) StyleNameInfos { // sortMapByValue returns the entries of m as a slice sorted by value, largest first.
+	p := make(StyleNameInfos, len(m))
+	i := 0
+	for k, v := range m {
+		p[i] = StyleNameInfo{k, v}
+		i++
+	}
+	sort.Sort(sort.Reverse(p)) // Reverse inverts Less, giving descending Count. NOTE(review): entries with equal counts land in unspecified order (random map iteration, sort.Sort is not stable).
+	return p
+}

+ 2 - 1
sub_supplier/subhd/subhd_test.go

@@ -32,7 +32,8 @@ func TestSupplier_GetSubListFromFile(t *testing.T) {
 
 func TestSupplier_GetSubListFromFile4Series(t *testing.T) {
 
-	ser := "X:\\连续剧\\The Bad Batch"	// tt12708542
+	//ser := "X:\\连续剧\\The Bad Batch"	// tt12708542
+	ser := "X:\\连续剧\\瑞克和莫蒂 (2013)"	//
 	//ser := "X:\\连续剧\\杀死伊芙 (2018)"	// tt7016936
 	//ser := "X:\\连续剧\\Money.Heist"
 	//ser := "X:\\连续剧\\黑钱胜地 (2017)"