Browse Source

正在添加字幕的语言检测

Signed-off-by: allan716 <[email protected]>
allan716 4 years ago
parent
commit
9b37530879
9 changed files with 210 additions and 13 deletions
  1. 42 0
      common/subType.go
  2. 0 13
      common/util.go
  3. 1 0
      go.mod
  4. 2 0
      go.sum
  5. 113 0
      sub_parser/ass/ass.go
  6. 15 0
      sub_parser/ass/ass_test.go
  7. 7 0
      sub_parser/iSubParser.go
  8. 16 0
      sub_parser/srt/srt.go
  9. 14 0
      sub_parser/subFileInfo.go

+ 42 - 0
common/subType.go

@@ -0,0 +1,42 @@
+package common
+
+import (
+	"path/filepath"
+	"strings"
+)
+
+// IsSubTypeWanted reports whether subName contains one of the wanted subtitle
+// format names (the names carry no leading dot). The match is case-insensitive
+// and is a plain substring test, not an extension comparison.
+func IsSubTypeWanted(subName string) bool {
+	lowered := strings.ToLower(subName)
+	for _, wanted := range []string{SubTypeASS, SubTypeSSA, SubTypeSRT} {
+		if strings.Contains(lowered, wanted) {
+			return true
+		}
+	}
+	return false
+}
+
+// IsSubExtWanted reports whether the extension of the given subtitle file name
+// is one of the wanted subtitle extensions (SubExtASS, SubExtSSA, SubExtSRT).
+// The extension is lower-cased first, so the comparison is case-insensitive.
+func IsSubExtWanted(subName string) bool {
+	// Go switch cases never fall through implicitly, so every accepted
+	// extension has to be listed in the same case clause.
+	switch strings.ToLower(filepath.Ext(subName)) {
+	case SubExtSSA, SubExtASS, SubExtSRT:
+		return true
+	default:
+		return false
+	}
+}
+
+const (
+	SubTypeASS = "ass" // format name without a dot; substring-matched by IsSubTypeWanted
+	SubTypeSSA = "ssa" // format name without a dot; substring-matched by IsSubTypeWanted
+	SubTypeSRT = "srt" // format name without a dot; substring-matched by IsSubTypeWanted
+
+	SubExtASS = ".ass" // file extension including the dot; compared by IsSubExtWanted
+	SubExtSSA = ".ssa" // file extension including the dot; compared by IsSubExtWanted
+	SubExtSRT = ".srt" // file extension including the dot; compared by IsSubExtWanted
+)

+ 0 - 13
common/util.go

@@ -98,19 +98,6 @@ func GetDebugFolder() (string, error) {
 	return nowProcessRoot, err
 }
 
-func IsSubTypeWanted(subName string) bool {
-	const subTypeASS = "ass"
-	const subTypeSSA = "ssa"
-	const subTypeSRT = "srt"
-	if strings.Contains(strings.ToLower(subName), subTypeASS) ||
-		strings.Contains(strings.ToLower(subName), subTypeSSA) ||
-		strings.Contains(strings.ToLower(subName), subTypeSRT) {
-		return true
-	}
-
-	return false
-}
-
 // ReqParam 可选择传入的参数
 type ReqParam struct {
 	UserExtList []string	// 用户确认的视频后缀名支持列表

+ 1 - 0
go.mod

@@ -4,6 +4,7 @@ go 1.15
 
 require (
 	github.com/PuerkitoBio/goquery v1.6.1
+	github.com/abadojack/whatlanggo v1.0.1
 	github.com/beevik/etree v1.1.0
 	github.com/go-resty/resty/v2 v2.6.0
 	github.com/go-rod/rod v0.97.2

+ 2 - 0
go.sum

@@ -1,5 +1,7 @@
 github.com/PuerkitoBio/goquery v1.6.1 h1:FgjbQZKl5HTmcn4sKBgvx8vv63nhyhIpv7lJpFGCWpk=
 github.com/PuerkitoBio/goquery v1.6.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc=
+github.com/abadojack/whatlanggo v1.0.1 h1:19N6YogDnf71CTHm3Mp2qhYfkRdyvbgwWdd2EPxJRG4=
+github.com/abadojack/whatlanggo v1.0.1/go.mod h1:66WiQbSbJBIlOZMsvbKe5m6pzQovxCH9B/K8tQB2uoc=
 github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo=
 github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y=
 github.com/beevik/etree v1.1.0 h1:T0xke/WvNtMoCqgzPhkX2r4rjY3GDZFi+FjpRZY2Jbs=

+ 113 - 0
sub_parser/ass/ass.go

@@ -0,0 +1,113 @@
+package ass
+
+import (
+	"github.com/abadojack/whatlanggo"
+	"github.com/allanpk716/ChineseSubFinder/common"
+	"github.com/allanpk716/ChineseSubFinder/sub_parser"
+	"io/ioutil"
+	"path/filepath"
+	"regexp"
+	"strings"
+)
+
+type Parser struct { // Parser inspects ASS/SSA subtitle files (see DetermineFileType).
+	langOptions whatlanggo.Options 	// options, including the language whitelist, passed to whatlanggo.DetectWithOptions
+}
+
+// NewParser returns a Parser whose language detection is restricted to a
+// whitelist of Mandarin Chinese, English, Japanese and Korean.
+func NewParser() *Parser {
+	// Build and return the configured value directly; returning a fresh
+	// &Parser{} here would silently drop the whitelist.
+	return &Parser{
+		langOptions: whatlanggo.Options{
+			Whitelist: map[whatlanggo.Lang]bool{
+				whatlanggo.Cmn: true, // Mandarin Chinese
+				whatlanggo.Eng: true, // English
+				whatlanggo.Jpn: true, // Japanese
+				whatlanggo.Kor: true, // Korean
+			},
+		},
+	}
+}
+
+// dialogueRe matches one Dialogue line of an ASS/SSA file and captures, in
+// order, the start time, the end time and the text. Compiled once at package
+// scope instead of on every call.
+var dialogueRe = regexp.MustCompile(regString)
+
+// overrideTagRe matches a single {...} override block. It stops at the first
+// closing brace so that text between two blocks is kept.
+var overrideTagRe = regexp.MustCompile(`\{[^}]*\}`)
+
+// DetermineFileType reads an ASS/SSA subtitle file and splits every Dialogue
+// entry into its displayed lines, with the aim of telling whether the file is
+// bilingual or in a single language.
+//
+// It returns (common.Unknow, nil, nil) when the extension is neither .ass nor
+// .ssa or when no Dialogue line matches, and (common.Unknow, nil, err) when the
+// file cannot be read. Otherwise it returns the parsed SubFileInfo; the
+// language result is still always common.Unknow because the classification is
+// not implemented yet.
+func (p Parser) DetermineFileType(filePath string) (common.Language, *sub_parser.SubFileInfo, error) {
+	nowExt := filepath.Ext(filePath)
+	lowerExt := strings.ToLower(nowExt)
+	if lowerExt != common.SubExtASS && lowerExt != common.SubExtSSA {
+		return common.Unknow, nil, nil
+	}
+	fBytes, err := ioutil.ReadFile(filePath)
+	if err != nil {
+		return common.Unknow, nil, err
+	}
+	// Each match holds: [1] start time, [2] end time, [3] text.
+	matched := dialogueRe.FindAllStringSubmatch(string(fBytes), -1)
+	if len(matched) < 1 {
+		return common.Unknow, nil, nil
+	}
+	subFileInfo := sub_parser.SubFileInfo{}
+	subFileInfo.Ext = nowExt
+	subFileInfo.Dialogues = make([]sub_parser.OneDialogue, 0, len(matched))
+	// Count the dialogues that contain a \N line break; their share of all
+	// dialogues tells whether the file is bilingual.
+	countLineFeed := 0
+	for _, oneLine := range matched {
+		odl := sub_parser.OneDialogue{
+			StartTime: oneLine[1],
+			EndTime:   oneLine[2],
+		}
+		// \h keeps two words on the same line; treat it as a plain space.
+		nowText := strings.ReplaceAll(oneLine[3], `\h`, " ")
+		// Drop the {...} override blocks, one block at a time.
+		nowText = overrideTagRe.ReplaceAllString(nowText, "")
+		nowText = strings.TrimRight(nowText, "\r")
+		// Normalise \n to \N so both kinds of line break are handled alike.
+		nowText = strings.ReplaceAll(nowText, `\n`, `\N`)
+		if strings.Contains(nowText, `\N`) {
+			// Usually a bilingual entry.
+			countLineFeed++
+		}
+		// Split yields a single element when there is no \N, and one element
+		// per displayed line otherwise (also for more than two lines).
+		odl.Lines = strings.Split(nowText, `\N`)
+
+		subFileInfo.Dialogues = append(subFileInfo.Dialogues, odl)
+	}
+	// The file counts as bilingual when more than 80% of the dialogues contain
+	// a line break.
+	isDouble := float32(countLineFeed)/float32(len(matched)) > 0.8
+	// NOTE(review): the println calls below are debug output; neither isDouble
+	// nor the detected languages feed into the returned value yet.
+	println(isDouble)
+	// Detect the language of every single line.
+	for _, dialogue := range subFileInfo.Dialogues {
+		for i, line := range dialogue.Lines {
+			println(line)
+			info := whatlanggo.DetectWithOptions(line, p.langOptions)
+			// Presumably info.Lang is -1 when no language is recognised - TODO confirm against whatlanggo.
+			println(i, "Language:", info.Lang, info.Lang.String())
+		}
+	}
+
+	return common.Unknow, &subFileInfo, nil
+}
+
+
+const (
+	// Pattern for one Dialogue line of the subtitle file; the capture groups are read by DetermineFileType as start time, end time and text
+	regString = `Dialogue: [^,.]*[0-9]*,([1-9]?[0-9]*:[0-9]*:[0-9]*.[0-9]*),([1-9]?[0-9]*:[0-9]*:[0-9]*.[0-9]*),[^,.]*,[^,.]*,[0-9]*,[0-9]*,[0-9]*,[^,.]*,(.*)`
+)

+ 15 - 0
sub_parser/ass/ass_test.go

@@ -0,0 +1,15 @@
+package ass
+
+import (
+	"os"
+	"testing"
+)
+
+// TestParser_DetermineFileType runs DetermineFileType against a local sample
+// file. The sample lives at a machine-specific path, so the test is skipped,
+// rather than failed, when that file does not exist.
+func TestParser_DetermineFileType(t *testing.T) {
+	filePath := "C:\\Tmp\\saw9.ass"
+	if _, err := os.Stat(filePath); os.IsNotExist(err) {
+		t.Skipf("sample subtitle %q not found, skipping", filePath)
+	}
+	parser := NewParser()
+	_, _, err := parser.DetermineFileType(filePath)
+	if err != nil {
+		t.Fatal(err)
+	}
+}

+ 7 - 0
sub_parser/iSubParser.go

@@ -0,0 +1,7 @@
+package sub_parser
+
+import "github.com/allanpk716/ChineseSubFinder/common"
+
+type ISubParser interface { // ISubParser is the method set shared by the subtitle parsers.
+	DetermineFileType(filePath string) (common.Language, *SubFileInfo, error) // takes a subtitle file path; returns a language, the parsed file info and an error
+}

+ 16 - 0
sub_parser/srt/srt.go

@@ -0,0 +1,16 @@
+package srt
+
+import (
+	"github.com/allanpk716/ChineseSubFinder/common"
+	"github.com/allanpk716/ChineseSubFinder/sub_parser"
+)
+
+type Parser struct { // Parser is the SRT subtitle parser; it has no fields yet.
+
+}
+
+// DetermineFileType is a placeholder for SRT files: it is not implemented yet
+// and panics when called.
+func (p Parser) DetermineFileType(filePath string) (common.Language, *sub_parser.SubFileInfo, error) {
+	panic("implement me")
+}
+
+const regString = `(\d+)\n([\d:,]+)\s+-{2}\>\s+([\d:,]+)\n([\s\S]*?(\n{2}|$))` // presumably one SRT entry: index, start time, end time, text - TODO confirm; not referenced in this file yet

+ 14 - 0
sub_parser/subFileInfo.go

@@ -0,0 +1,14 @@
+package sub_parser
+
+type SubFileInfo struct { // SubFileInfo holds the parsed content of one subtitle file.
+	Name	string			// name of the subtitle
+	Ext		string			// file extension of the subtitle
+	Dialogues []OneDialogue	// all dialogues of the whole subtitle file
+}
+
+// OneDialogue is a single dialogue entry.
+type OneDialogue struct {
+	StartTime string		// start time
+	EndTime string			// end time
+	Lines	[]string		// the spoken lines of this dialogue
+}