ass.go 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168
  1. package ass
  2. import (
  3. "github.com/allanpk716/ChineseSubFinder/internal/common"
  4. "github.com/allanpk716/ChineseSubFinder/internal/pkg/language"
  5. "github.com/allanpk716/ChineseSubFinder/internal/types/subparser"
  6. "io/ioutil"
  7. "path/filepath"
  8. "regexp"
  9. "sort"
  10. "strings"
  11. )
  // Parser is the subtitle parser for the "ass" format. It carries no state.
  type Parser struct{}

  // NewParser returns a pointer to a fresh Parser.
  func NewParser() *Parser {
  	return new(Parser)
  }

  // GetParserName returns the identifier of this parser, which is always "ass".
  func (p Parser) GetParserName() string {
  	const parserName = "ass"
  	return parserName
  }
  20. /*
  21. DetermineFileTypeFromFile 确定字幕文件的类型,是双语字幕或者某一种语言等等信息
  22. 当 error 是 common.DetermineFileTypeFromFileExtNotFitASSorSSA
  23. 需要额外的处理逻辑,比如不用报错,而是跳过后续的逻辑
  24. */
  25. func (p Parser) DetermineFileTypeFromFile(filePath string) (bool, *subparser.FileInfo, error) {
  26. nowExt := filepath.Ext(filePath)
  27. if strings.ToLower(nowExt) != common.SubExtASS && strings.ToLower(nowExt) != common.SubExtSSA {
  28. return false, nil, nil
  29. }
  30. fBytes, err := ioutil.ReadFile(filePath)
  31. if err != nil {
  32. return false, nil, err
  33. }
  34. inBytes, err := language.ChangeFileCoding2UTF8(fBytes)
  35. if err != nil {
  36. return false, nil, err
  37. }
  38. return p.DetermineFileTypeFromBytes(inBytes, nowExt)
  39. }
  40. // DetermineFileTypeFromBytes 确定字幕文件的类型,是双语字幕或者某一种语言等等信息
  41. func (p Parser) DetermineFileTypeFromBytes(inBytes []byte, nowExt string) (bool, *subparser.FileInfo, error) {
  42. allString := string(inBytes)
  43. // 注意,需要替换掉 \r 不然正则表达式会有问题
  44. allString = strings.ReplaceAll(allString, "\r", "")
  45. re := regexp.MustCompile(regString)
  46. // 找到 start end text
  47. matched := re.FindAllStringSubmatch(allString, -1)
  48. if len(matched) < 1 {
  49. return false, nil, nil
  50. }
  51. subFileInfo := subparser.FileInfo{}
  52. subFileInfo.Ext = nowExt
  53. subFileInfo.Dialogues = make([]subparser.OneDialogue, 0)
  54. // 这里需要统计一共有几个 \N,以及这个数量在整体行数中的比例,这样就知道是不是双语字幕了
  55. countLineFeed := 0
  56. // 有意义的对话统计数,排除 Style 类型
  57. usefullDialogueCount := 0
  58. // 先进行字幕 StyleName 的出现次数排序,找到最多的,就是常规字幕的,不是特效的
  59. var nameMap = make(map[string]int)
  60. for _, oneLine := range matched {
  61. nowStyleName := oneLine[3]
  62. _, ok := nameMap[nowStyleName]
  63. if ok == false {
  64. nameMap[nowStyleName] = 1
  65. } else {
  66. nameMap[nowStyleName]++
  67. }
  68. }
  69. mapByValue := sortMapByValue(nameMap)
  70. // 先读取一次字幕文件
  71. for _, oneLine := range matched {
  72. // 排除特效内容,只统计有意义的对话部分
  73. if strings.Contains(oneLine[0], mapByValue[0].Name) == false {
  74. continue
  75. }
  76. usefullDialogueCount++
  77. startTime := oneLine[1]
  78. endTime := oneLine[2]
  79. nowStyleName := oneLine[3]
  80. nowText := oneLine[4]
  81. odl := subparser.OneDialogue{
  82. StyleName: nowStyleName,
  83. StartTime: startTime,
  84. EndTime: endTime,
  85. }
  86. odl.Lines = make([]string, 0)
  87. // nowText 优先移除 \h 这个是替换空格, \h 是让两个词在一行,不换行显示
  88. nowText = strings.ReplaceAll(nowText, `\h`, " ")
  89. // nowText 这个需要先把 {} 花括号内的内容给移除
  90. var re = regexp.MustCompile(`(?m)((?i){[^}]*})`)
  91. nowText1 := re.ReplaceAllString(nowText, "")
  92. nowText1 = strings.TrimRight(nowText1, "\r")
  93. // 然后判断是否有 \N 或者 \n
  94. // 直接把 \n 替换为 \N 来解析
  95. nowText1 = strings.ReplaceAll(nowText1, `\n`, `\N`)
  96. if strings.Contains(nowText1, `\N`) {
  97. // 有,那么就需要再次切割,一般是双语字幕
  98. var re2 = regexp.MustCompile(`(?i)(.*)\\N(.*)`)
  99. for _, matched2 := range re2.FindAllStringSubmatch(nowText1, -1) {
  100. for i, s := range matched2 {
  101. if i == 0 {
  102. continue
  103. }
  104. odl.Lines = append(odl.Lines, s)
  105. }
  106. }
  107. countLineFeed++
  108. } else {
  109. // 无,则可以直接添加
  110. odl.Lines = append(odl.Lines, nowText1)
  111. }
  112. subFileInfo.Dialogues = append(subFileInfo.Dialogues, odl)
  113. }
  114. // 再分析
  115. // 需要判断每一个 Line 是啥语言,[语言的code]次数
  116. var langDict map[int]int
  117. langDict = make(map[int]int)
  118. // 抽取出所有的中文对话
  119. var chLines = make([]string, 0)
  120. // 抽取出所有的第二语言对话
  121. var otherLines = make([]string, 0)
  122. for _, dialogue := range subFileInfo.Dialogues {
  123. language.DetectSubLangAndStatistics(dialogue.Lines, langDict, &chLines, &otherLines)
  124. }
  125. // 从统计出来的字典,找出 Top 1 或者 2 的出来,然后计算出是什么语言的字幕
  126. detectLang := language.SubLangStatistics2SubLangType(float32(countLineFeed), float32(usefullDialogueCount), langDict, chLines)
  127. subFileInfo.Lang = detectLang
  128. subFileInfo.Data = inBytes
  129. subFileInfo.CHLines = chLines
  130. subFileInfo.OtherLines = otherLines
  131. return true, &subFileInfo, nil
  132. }
  const (
  	// regString matches a single "Dialogue:" line of a subtitle file.
  	// Capture groups: 1 = start time, 2 = end time, 3 = style name,
  	// 4 = the dialogue text (the rest of the line).
  	regString = `Dialogue: [^,.]*[0-9]*,([1-9]?[0-9]*:[0-9]*:[0-9]*.[0-9]*),([1-9]?[0-9]*:[0-9]*:[0-9]*.[0-9]*),([^,.]*),[^,.]*,[0-9]*,[0-9]*,[0-9]*,[^,.]*,(.*)`
  	// regString4Style matches a "Style:" line of an ass file and captures the
  	// word that follows it, up to the first comma.
  	regString4Style = `(?m)^Style:\s*(\w+),`
  )
  // StyleNameInfo pairs a style name with the number of times it occurs.
  type StyleNameInfo struct {
  	Name  string
  	Count int
  }

  // StyleNameInfos is a list of StyleNameInfo that implements sort.Interface.
  // The natural order is ascending by Count; entries with the same Count are
  // ordered by descending Name, so that the reversed order used by
  // sortMapByValue is Count descending, then Name ascending.
  type StyleNameInfos []StyleNameInfo

  // Len returns the number of entries.
  func (a StyleNameInfos) Len() int { return len(a) }

  // Less reports whether entry i sorts before entry j. Ties on Count are broken
  // by Name so that the resulting order never depends on the input order.
  func (a StyleNameInfos) Less(i, j int) bool {
  	if a[i].Count != a[j].Count {
  		return a[i].Count < a[j].Count
  	}
  	return a[i].Name > a[j].Name
  }

  // Swap exchanges entries i and j.
  func (a StyleNameInfos) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

  // sortMapByValue converts m (style name -> occurrence count) into a list
  // sorted by count, highest first. Names with equal counts are returned in
  // ascending name order; without that tie-break the result depended on map
  // iteration order and could differ between runs.
  func sortMapByValue(m map[string]int) StyleNameInfos {
  	p := make(StyleNameInfos, 0, len(m))
  	for name, count := range m {
  		p = append(p, StyleNameInfo{Name: name, Count: count})
  	}
  	sort.Sort(sort.Reverse(p))
  	return p
  }
  157. }