srt.go 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. package srt
  2. import (
  3. "github.com/allanpk716/ChineseSubFinder/common"
  4. "github.com/allanpk716/ChineseSubFinder/sub_parser"
  5. "io/ioutil"
  6. "path/filepath"
  7. "regexp"
  8. "strings"
  9. )
  10. type Parser struct {
  11. }
  12. func NewParser() *Parser {
  13. return &Parser{}
  14. }
  15. // DetermineFileTypeFromFile 确定字幕文件的类型,是双语字幕或者某一种语言等等信息
  16. func (p Parser) DetermineFileTypeFromFile(filePath string) (*sub_parser.SubFileInfo, error) {
  17. nowExt := filepath.Ext(filePath)
  18. if strings.ToLower(nowExt) != common.SubExtSRT {
  19. return nil ,nil
  20. }
  21. fBytes, err := ioutil.ReadFile(filePath)
  22. if err != nil {
  23. return nil ,err
  24. }
  25. return p.DetermineFileTypeFromBytes(fBytes, nowExt)
  26. }
  27. // DetermineFileTypeFromBytes 确定字幕文件的类型,是双语字幕或者某一种语言等等信息
  28. func (p Parser) DetermineFileTypeFromBytes(inBytes []byte, nowExt string) (*sub_parser.SubFileInfo, error){
  29. allString := string(inBytes)
  30. // 注意,需要替换掉 \r 不然正则表达式会有问题
  31. allString = strings.ReplaceAll(allString, "\r", "")
  32. re := regexp.MustCompile(regString)
  33. // 找到 start end text
  34. matched := re.FindAllStringSubmatch(allString, -1)
  35. if len(matched) < 1 {
  36. return nil ,nil
  37. }
  38. subFileInfo := sub_parser.SubFileInfo{}
  39. subFileInfo.Ext = nowExt
  40. subFileInfo.Dialogues = make([]sub_parser.OneDialogue, 0)
  41. // 这里需要统计一共有几个 \N,以及这个数量在整体行数中的比例,这样就知道是不是双语字幕了
  42. countLineFeed := 0
  43. for _, oneDial := range matched {
  44. startTime := oneDial[2]
  45. endTime := oneDial[3]
  46. nowText := oneDial[4]
  47. odl := sub_parser.OneDialogue{
  48. StartTime: startTime,
  49. EndTime: endTime,
  50. }
  51. odl.Lines = make([]string, 0)
  52. nowText = strings.TrimRight(nowText, "\n")
  53. texts := strings.Split(nowText, "\n")
  54. for i, text := range texts {
  55. if i == 1 {
  56. // 这样说明有两行字幕,也就是双语啦
  57. countLineFeed++
  58. }
  59. odl.Lines = append(odl.Lines, text)
  60. }
  61. subFileInfo.Dialogues = append(subFileInfo.Dialogues, odl)
  62. }
  63. // 再分析
  64. // 是不是双语字幕,定义,超过 80% 就一定是了(不可能三语吧···)
  65. isDouble := false
  66. perLines := float32(countLineFeed) / float32(len(matched))
  67. if perLines > 0.8 {
  68. isDouble = true
  69. }
  70. // 需要判断每一个 Line 是啥语言,[语言的code]次数
  71. var langDict map[int]int
  72. langDict = make(map[int]int)
  73. for _, dialogue := range subFileInfo.Dialogues {
  74. common.DetectSubLangAndStatistics(dialogue.Lines, langDict)
  75. }
  76. // 从统计出来的字典,找出 Top 1 或者 2 的出来,然后计算出是什么语言的字幕
  77. detectLang := common.SubLangStatistics2SubLangType(isDouble, langDict)
  78. subFileInfo.Lang = detectLang
  79. return &subFileInfo, nil
  80. }
  81. const regString = `(\d+)\n([\d:,]+)\s+-{2}\>\s+([\d:,]+)\n([\s\S]*?(\n{2}|$))`