ass.go 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. package ass
  2. import (
  3. "github.com/allanpk716/ChineseSubFinder/internal/common"
  4. "github.com/allanpk716/ChineseSubFinder/internal/pkg/language"
  5. "github.com/allanpk716/ChineseSubFinder/internal/types/subparser"
  6. "io/ioutil"
  7. "path/filepath"
  8. "regexp"
  9. "sort"
  10. "strings"
  11. )
  // Parser is the subtitle parser for the ASS/SSA format. It holds no state.
  type Parser struct{}

  // NewParser allocates a Parser and returns a pointer to it.
  func NewParser() *Parser {
  	return new(Parser)
  }

  // GetParserName reports the identifier of this parser, which is always "ass".
  func (p Parser) GetParserName() string {
  	const parserName = "ass"
  	return parserName
  }
  20. // DetermineFileTypeFromFile 确定字幕文件的类型,是双语字幕或者某一种语言等等信息
  21. func (p Parser) DetermineFileTypeFromFile(filePath string) (*subparser.FileInfo, error) {
  22. nowExt := filepath.Ext(filePath)
  23. if strings.ToLower(nowExt) != common.SubExtASS && strings.ToLower(nowExt) != common.SubExtSSA {
  24. return nil ,nil
  25. }
  26. fBytes, err := ioutil.ReadFile(filePath)
  27. if err != nil {
  28. return nil ,err
  29. }
  30. inBytes, err := language.ChangeFileCoding2UTF8(fBytes)
  31. if err != nil {
  32. return nil, err
  33. }
  34. return p.DetermineFileTypeFromBytes(inBytes, nowExt)
  35. }
  36. // DetermineFileTypeFromBytes 确定字幕文件的类型,是双语字幕或者某一种语言等等信息
  37. func (p Parser) DetermineFileTypeFromBytes(inBytes []byte, nowExt string) (*subparser.FileInfo, error){
  38. allString :=string(inBytes)
  39. // 注意,需要替换掉 \r 不然正则表达式会有问题
  40. allString = strings.ReplaceAll(allString, "\r", "")
  41. re := regexp.MustCompile(regString)
  42. // 找到 start end text
  43. matched := re.FindAllStringSubmatch(allString, -1)
  44. if len(matched) < 1 {
  45. return nil ,nil
  46. }
  47. subFileInfo := subparser.FileInfo{}
  48. subFileInfo.Ext = nowExt
  49. subFileInfo.Dialogues = make([]subparser.OneDialogue, 0)
  50. // 这里需要统计一共有几个 \N,以及这个数量在整体行数中的比例,这样就知道是不是双语字幕了
  51. countLineFeed := 0
  52. // 有意义的对话统计数,排除 Style 类型
  53. usefullDialogueCount := 0
  54. // 先进行字幕 StyleName 的出现次数排序,找到最多的,就是常规字幕的,不是特效的
  55. var nameMap = make(map[string]int)
  56. for _, oneLine := range matched {
  57. nowStyleName := oneLine[3]
  58. _, ok := nameMap[nowStyleName]
  59. if ok == false {
  60. nameMap[nowStyleName] = 1
  61. } else {
  62. nameMap[nowStyleName]++
  63. }
  64. }
  65. mapByValue := sortMapByValue(nameMap)
  66. // 先读取一次字幕文件
  67. for _, oneLine := range matched {
  68. // 排除特效内容,只统计有意义的对话部分
  69. if strings.Contains(oneLine[0], mapByValue[0].Name) == false {
  70. continue
  71. }
  72. usefullDialogueCount++
  73. startTime := oneLine[1]
  74. endTime := oneLine[2]
  75. nowStyleName := oneLine[3]
  76. nowText := oneLine[4]
  77. odl := subparser.OneDialogue{
  78. StyleName: nowStyleName,
  79. StartTime: startTime,
  80. EndTime: endTime,
  81. }
  82. odl.Lines = make([]string, 0)
  83. // nowText 优先移除 \h 这个是替换空格, \h 是让两个词在一行,不换行显示
  84. nowText = strings.ReplaceAll(nowText, `\h` , " ")
  85. // nowText 这个需要先把 {} 花括号内的内容给移除
  86. var re = regexp.MustCompile(`(?i){.*}`)
  87. nowText1 := re.ReplaceAllString(nowText, "")
  88. nowText1 = strings.TrimRight(nowText1, "\r")
  89. // 然后判断是否有 \N 或者 \n
  90. // 直接把 \n 替换为 \N 来解析
  91. nowText1 = strings.ReplaceAll(nowText1, `\n` , `\N`)
  92. if strings.Contains(nowText1,`\N`) {
  93. // 有,那么就需要再次切割,一般是双语字幕
  94. var re2 = regexp.MustCompile(`(?i)(.*)\\N(.*)`)
  95. for _, matched2 := range re2.FindAllStringSubmatch(nowText1, -1) {
  96. for i, s := range matched2 {
  97. if i == 0 {continue}
  98. odl.Lines = append(odl.Lines, s)
  99. }
  100. }
  101. countLineFeed++
  102. } else {
  103. // 无,则可以直接添加
  104. odl.Lines = append(odl.Lines, nowText1)
  105. }
  106. subFileInfo.Dialogues = append(subFileInfo.Dialogues, odl)
  107. }
  108. // 再分析
  109. // 需要判断每一个 Line 是啥语言,[语言的code]次数
  110. var langDict map[int]int
  111. langDict = make(map[int]int)
  112. var chLines = make([]string, 0)
  113. for _, dialogue := range subFileInfo.Dialogues {
  114. language.DetectSubLangAndStatistics(dialogue.Lines, langDict, &chLines)
  115. }
  116. // 从统计出来的字典,找出 Top 1 或者 2 的出来,然后计算出是什么语言的字幕
  117. detectLang := language.SubLangStatistics2SubLangType(float32(countLineFeed), float32(usefullDialogueCount), langDict, chLines)
  118. subFileInfo.Lang = detectLang
  119. subFileInfo.Data = inBytes
  120. return &subFileInfo, nil
  121. }
  // regString matches one "Dialogue:" line of an ASS/SSA subtitle file.
  // Capture groups: 1 = start time, 2 = end time, 3 = style name,
  // 4 = the text, i.e. everything after the ninth comma up to the end of the line.
  const regString = `Dialogue: [^,.]*[0-9]*,([1-9]?[0-9]*:[0-9]*:[0-9]*.[0-9]*),([1-9]?[0-9]*:[0-9]*:[0-9]*.[0-9]*),([^,.]*),[^,.]*,[0-9]*,[0-9]*,[0-9]*,[^,.]*,(.*)`

  // regString4Style matches a "Style:" line of an ASS file (multi-line mode)
  // and captures the word characters that precede the first comma.
  const regString4Style = `(?m)^Style:\s*(\w+),`
  // StyleNameInfo pairs a style name with the number of times it was counted.
  type StyleNameInfo struct {
  	Name  string
  	Count int
  }

  // StyleNameInfos is a list of StyleNameInfo that implements sort.Interface,
  // ordering by ascending Count.
  type StyleNameInfos []StyleNameInfo

  // Len returns the number of entries.
  func (a StyleNameInfos) Len() int { return len(a) }

  // Less orders entries by ascending Count. Entries with equal counts are
  // ordered by descending Name, so that reversing the sort (as sortMapByValue
  // does) lists ties in ascending name order and the result never depends on
  // the order in which the entries were inserted.
  func (a StyleNameInfos) Less(i, j int) bool {
  	if a[i].Count != a[j].Count {
  		return a[i].Count < a[j].Count
  	}
  	return a[i].Name > a[j].Name
  }

  // Swap exchanges the entries at positions i and j.
  func (a StyleNameInfos) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

  // sortMapByValue converts a name -> count map into a list sorted by
  // descending count; names with the same count appear in ascending name order.
  // A nil or empty map yields an empty list.
  func sortMapByValue(m map[string]int) StyleNameInfos {
  	p := make(StyleNameInfos, 0, len(m))
  	// Map iteration order is random; the total order defined by Less makes
  	// the sorted result deterministic regardless.
  	for k, v := range m {
  		p = append(p, StyleNameInfo{Name: k, Count: v})
  	}
  	sort.Sort(sort.Reverse(p))
  	return p
  }