srt.go 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. package srt
  2. import (
  3. "github.com/allanpk716/ChineseSubFinder/internal/pkg/language"
  4. "github.com/allanpk716/ChineseSubFinder/internal/pkg/log_helper"
  5. "github.com/allanpk716/ChineseSubFinder/internal/pkg/regex_things"
  6. "github.com/allanpk716/ChineseSubFinder/internal/types/subparser"
  7. "os"
  8. "path/filepath"
  9. "strings"
  10. )
  11. type Parser struct {
  12. }
  13. func NewParser() *Parser {
  14. return &Parser{}
  15. }
  16. func (p Parser) GetParserName() string {
  17. return "srt"
  18. }
  19. /*
  20. DetermineFileTypeFromFile 确定字幕文件的类型,是双语字幕或者某一种语言等等信息
  21. 当 error 是 common.DetermineFileTypeFromFileExtNotFitSRT
  22. 需要额外的处理逻辑,比如不用报错,而是跳过后续的逻辑
  23. */
  24. func (p Parser) DetermineFileTypeFromFile(filePath string) (bool, *subparser.FileInfo, error) {
  25. nowExt := filepath.Ext(filePath)
  26. log_helper.GetLogger().Debugln("DetermineFileTypeFromFile", p.GetParserName(), filePath)
  27. fBytes, err := os.ReadFile(filePath)
  28. if err != nil {
  29. return false, nil, err
  30. }
  31. inBytes, err := language.ChangeFileCoding2UTF8(fBytes)
  32. if err != nil {
  33. return false, nil, err
  34. }
  35. return p.DetermineFileTypeFromBytes(inBytes, nowExt)
  36. }
  37. // DetermineFileTypeFromBytes 确定字幕文件的类型,是双语字幕或者某一种语言等等信息
  38. func (p Parser) DetermineFileTypeFromBytes(inBytes []byte, nowExt string) (bool, *subparser.FileInfo, error) {
  39. allString := string(inBytes)
  40. // 注意,需要替换掉 \r 不然正则表达式会有问题
  41. allString = strings.ReplaceAll(allString, "\r", "")
  42. // 找到 start end text
  43. matched := regex_things.ReMatchDialogueSRT.FindAllStringSubmatch(allString, -1)
  44. if matched == nil || len(matched) < 1 {
  45. matched = regex_things.ReMatchDialogueSRT2.FindAllStringSubmatch(allString, -1)
  46. if matched == nil || len(matched) < 1 {
  47. log_helper.GetLogger().Debugln("DetermineFileTypeFromBytes can't found DialoguesFilter, Skip")
  48. return false, nil, nil
  49. }
  50. }
  51. subFileInfo := subparser.FileInfo{}
  52. subFileInfo.Content = string(inBytes)
  53. subFileInfo.Ext = nowExt
  54. subFileInfo.Dialogues = make([]subparser.OneDialogue, 0)
  55. subFileInfo.DialoguesFilter = make([]subparser.OneDialogue, 0)
  56. // 这里需要统计一共有几个 \N,以及这个数量在整体行数中的比例,这样就知道是不是双语字幕了
  57. countLineFeed := 0
  58. for _, oneDial := range matched {
  59. startTime := oneDial[2]
  60. endTime := oneDial[3]
  61. nowText := oneDial[4]
  62. odl := subparser.OneDialogue{
  63. StartTime: startTime,
  64. EndTime: endTime,
  65. }
  66. odl.Lines = make([]string, 0)
  67. nowText = strings.TrimRight(nowText, "\n")
  68. texts := strings.Split(nowText, "\n")
  69. for i, text := range texts {
  70. if i == 1 {
  71. // 这样说明有两行字幕,也就是双语啦
  72. countLineFeed++
  73. }
  74. // 剔除 {\fn微软雅黑\fs14}C'mon, Rick. We're -- We're almost there. {} 这一段
  75. text = regex_things.ReMatchBrace.ReplaceAllString(text, "")
  76. text = regex_things.ReMatchBracket.ReplaceAllString(text, "")
  77. text = strings.ReplaceAll(text, `\N`, "")
  78. odl.Lines = append(odl.Lines, text)
  79. }
  80. subFileInfo.Dialogues = append(subFileInfo.Dialogues, odl)
  81. subFileInfo.DialoguesFilter = append(subFileInfo.DialoguesFilter, odl)
  82. }
  83. // 再分析
  84. // 需要判断每一个 Line 是啥语言,[语言的code]次数
  85. var langDict map[int]int
  86. langDict = make(map[int]int)
  87. // 抽取出所有的中文对话
  88. var chLines = make([]string, 0)
  89. // 抽取出所有的第二语言对话
  90. var otherLines = make([]string, 0)
  91. // 抽取出来的对话数组,为了后续用来匹配和修改时间轴
  92. var usefulDialogueExs = make([]subparser.OneDialogueEx, 0)
  93. emptyLines := 0
  94. for _, dialogue := range subFileInfo.DialoguesFilter {
  95. emptyLines += language.DetectSubLangAndStatistics(dialogue, langDict, &usefulDialogueExs, &chLines, &otherLines)
  96. }
  97. // 从统计出来的字典,找出 Top 1 或者 2 的出来,然后计算出是什么语言的字幕
  98. detectLang := language.SubLangStatistics2SubLangType(float32(countLineFeed), float32(len(matched)-emptyLines), langDict, chLines)
  99. subFileInfo.Lang = detectLang
  100. subFileInfo.Data = inBytes
  101. subFileInfo.DialoguesFilterEx = usefulDialogueExs
  102. subFileInfo.CHLines = chLines
  103. subFileInfo.OtherLines = otherLines
  104. return true, &subFileInfo, nil
  105. }
  106. func (p Parser) parseContent(inBytes []byte) {
  107. allString := string(inBytes)
  108. // 注意,需要替换掉 \r 不然正则表达式会有问题
  109. allString = strings.ReplaceAll(allString, "\r", "")
  110. }