| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122 | 
							- package srt
 
- import (
 
- 	"github.com/allanpk716/ChineseSubFinder/internal/pkg/language"
 
- 	"github.com/allanpk716/ChineseSubFinder/internal/pkg/log_helper"
 
- 	"github.com/allanpk716/ChineseSubFinder/internal/pkg/regex_things"
 
- 	"github.com/allanpk716/ChineseSubFinder/internal/types/subparser"
 
- 	"os"
 
- 	"path/filepath"
 
- 	"strings"
 
- )
 
// Parser is the SRT subtitle parser. It carries no state of its own.
type Parser struct{}
 
- func NewParser() *Parser {
 
- 	return &Parser{}
 
- }
 
- func (p Parser) GetParserName() string {
 
- 	return "srt"
 
- }
 
- /*
 
- 	DetermineFileTypeFromFile 确定字幕文件的类型,是双语字幕或者某一种语言等等信息
 
- 	当 error 是 common.DetermineFileTypeFromFileExtNotFitSRT
 
- 	需要额外的处理逻辑,比如不用报错,而是跳过后续的逻辑
 
- */
 
- func (p Parser) DetermineFileTypeFromFile(filePath string) (bool, *subparser.FileInfo, error) {
 
- 	nowExt := filepath.Ext(filePath)
 
- 	log_helper.GetLogger().Debugln("DetermineFileTypeFromFile", p.GetParserName(), filePath)
 
- 	fBytes, err := os.ReadFile(filePath)
 
- 	if err != nil {
 
- 		return false, nil, err
 
- 	}
 
- 	inBytes, err := language.ChangeFileCoding2UTF8(fBytes)
 
- 	if err != nil {
 
- 		return false, nil, err
 
- 	}
 
- 	return p.DetermineFileTypeFromBytes(inBytes, nowExt)
 
- }
 
- // DetermineFileTypeFromBytes 确定字幕文件的类型,是双语字幕或者某一种语言等等信息
 
- func (p Parser) DetermineFileTypeFromBytes(inBytes []byte, nowExt string) (bool, *subparser.FileInfo, error) {
 
- 	allString := string(inBytes)
 
- 	// 注意,需要替换掉 \r 不然正则表达式会有问题
 
- 	allString = strings.ReplaceAll(allString, "\r", "")
 
- 	// 找到 start end text
 
- 	matched := regex_things.ReMatchDialogueSRT.FindAllStringSubmatch(allString, -1)
 
- 	if matched == nil || len(matched) < 1 {
 
- 		matched = regex_things.ReMatchDialogueSRT2.FindAllStringSubmatch(allString, -1)
 
- 		if matched == nil || len(matched) < 1 {
 
- 			log_helper.GetLogger().Debugln("DetermineFileTypeFromBytes can't found DialoguesFilter, Skip")
 
- 			return false, nil, nil
 
- 		}
 
- 	}
 
- 	subFileInfo := subparser.FileInfo{}
 
- 	subFileInfo.Content = string(inBytes)
 
- 	subFileInfo.Ext = nowExt
 
- 	subFileInfo.Dialogues = make([]subparser.OneDialogue, 0)
 
- 	subFileInfo.DialoguesFilter = make([]subparser.OneDialogue, 0)
 
- 	// 这里需要统计一共有几个 \N,以及这个数量在整体行数中的比例,这样就知道是不是双语字幕了
 
- 	countLineFeed := 0
 
- 	for _, oneDial := range matched {
 
- 		startTime := oneDial[2]
 
- 		endTime := oneDial[3]
 
- 		nowText := oneDial[4]
 
- 		odl := subparser.OneDialogue{
 
- 			StartTime: startTime,
 
- 			EndTime:   endTime,
 
- 		}
 
- 		odl.Lines = make([]string, 0)
 
- 		nowText = strings.TrimRight(nowText, "\n")
 
- 		texts := strings.Split(nowText, "\n")
 
- 		for i, text := range texts {
 
- 			if i == 1 {
 
- 				// 这样说明有两行字幕,也就是双语啦
 
- 				countLineFeed++
 
- 			}
 
- 			// 剔除 {\fn微软雅黑\fs14}C'mon, Rick. We're -- We're almost there. {} 这一段
 
- 			text = regex_things.ReMatchBrace.ReplaceAllString(text, "")
 
- 			text = regex_things.ReMatchBracket.ReplaceAllString(text, "")
 
- 			text = strings.ReplaceAll(text, `\N`, "")
 
- 			odl.Lines = append(odl.Lines, text)
 
- 		}
 
- 		subFileInfo.Dialogues = append(subFileInfo.Dialogues, odl)
 
- 		subFileInfo.DialoguesFilter = append(subFileInfo.DialoguesFilter, odl)
 
- 	}
 
- 	// 再分析
 
- 	// 需要判断每一个 Line 是啥语言,[语言的code]次数
 
- 	var langDict map[int]int
 
- 	langDict = make(map[int]int)
 
- 	// 抽取出所有的中文对话
 
- 	var chLines = make([]string, 0)
 
- 	// 抽取出所有的第二语言对话
 
- 	var otherLines = make([]string, 0)
 
- 	// 抽取出来的对话数组,为了后续用来匹配和修改时间轴
 
- 	var usefulDialogueExs = make([]subparser.OneDialogueEx, 0)
 
- 	emptyLines := 0
 
- 	for _, dialogue := range subFileInfo.DialoguesFilter {
 
- 		emptyLines += language.DetectSubLangAndStatistics(dialogue, langDict, &usefulDialogueExs, &chLines, &otherLines)
 
- 	}
 
- 	// 从统计出来的字典,找出 Top 1 或者 2 的出来,然后计算出是什么语言的字幕
 
- 	detectLang := language.SubLangStatistics2SubLangType(float32(countLineFeed), float32(len(matched)-emptyLines), langDict, chLines)
 
- 	subFileInfo.Lang = detectLang
 
- 	subFileInfo.Data = inBytes
 
- 	subFileInfo.DialoguesFilterEx = usefulDialogueExs
 
- 	subFileInfo.CHLines = chLines
 
- 	subFileInfo.OtherLines = otherLines
 
- 	return true, &subFileInfo, nil
 
- }
 
- func (p Parser) parseContent(inBytes []byte) {
 
- 	allString := string(inBytes)
 
- 	// 注意,需要替换掉 \r 不然正则表达式会有问题
 
- 	allString = strings.ReplaceAll(allString, "\r", "")
 
- }
 
 
  |