pipeline.go 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271
  1. package sub_timeline_fixer
  2. import (
  3. "errors"
  4. "fmt"
  5. "github.com/allanpk716/ChineseSubFinder/internal/pkg/gss"
  6. "github.com/allanpk716/ChineseSubFinder/internal/pkg/my_util"
  7. "github.com/allanpk716/ChineseSubFinder/internal/pkg/sub_helper"
  8. "github.com/allanpk716/ChineseSubFinder/internal/pkg/vad"
  9. "github.com/allanpk716/ChineseSubFinder/internal/types/subparser"
  10. "github.com/huandu/go-clone"
  11. "os"
  12. "sort"
  13. "strings"
  14. "time"
  15. )
  // Pipeline aligns a source subtitle against a reference (another subtitle or
// audio VAD data) and rewrites subtitle timestamps with the resulting offset.
type Pipeline struct {
	// MaxOffsetSeconds is the largest offset, in seconds, that CalcOffsetTime
	// accepts from an alignment candidate.
	MaxOffsetSeconds int
	// framerateRatios caches the candidate ratios built by getFramerateRatios2Try.
	framerateRatios []float64
}

// NewPipeline returns a Pipeline limited to maxOffsetSeconds, starting with an
// empty (non-nil) frame-rate ratio cache.
func NewPipeline(maxOffsetSeconds int) *Pipeline {
	pipeline := new(Pipeline)
	pipeline.MaxOffsetSeconds = maxOffsetSeconds
	pipeline.framerateRatios = []float64{}
	return pipeline
}
  26. func (p Pipeline) CalcOffsetTime(infoBase, infoSrc *subparser.FileInfo, audioVadList []vad.VADInfo, useGSS bool) (PipeResult, error) {
  27. baseVADInfo := make([]float64, 0)
  28. useSubtitleOrAudioAsBase := false
  29. // 排序
  30. infoSrc.SortDialogues()
  31. if infoBase == nil && audioVadList != nil {
  32. baseVADInfo = vad.GetFloatSlice(audioVadList)
  33. useSubtitleOrAudioAsBase = true
  34. } else if infoBase != nil {
  35. useSubtitleOrAudioAsBase = false
  36. // 排序
  37. infoBase.SortDialogues()
  38. // 解析处 VAD 信息
  39. baseUnitNew, err := sub_helper.GetVADInfoFeatureFromSubNew(infoBase, 0)
  40. if err != nil {
  41. return PipeResult{}, err
  42. }
  43. baseVADInfo = baseUnitNew.GetVADFloatSlice()
  44. } else {
  45. return PipeResult{}, errors.New("FixTimeline input is error")
  46. }
  47. pipeResults := make([]PipeResult, 0)
  48. /*
  49. 这里复现 ffsubsync 的思路
  50. 1. 首先由 getFramerateRatios2Try 得到多个帧数比率的数值,理论上有以下 7 个值:
  51. 将 frameRateRatio = 1.0 插入到 framerateRatios 这个队列的首位
  52. [0] 1.0
  53. [1] 1.001001001001001
  54. [2] 1.0427093760427095
  55. [3] 1.0416666666666667
  56. [4] 0.9989999999999999
  57. [5] 0.9590399999999999
  58. [6] 0.96
  59. 得到一个 framerateRatios 列表
  60. 2. 计算 base 字幕的 num_frames,以及 frameRateRatio = 1.0 时 src 字幕的 num_frames
  61. 推断 frame ratio 比率是多少,得到一个,inferred_framerate_ratio_from_length = base / src
  62. 把这个值插入到 framerateRatios 的尾部也就是第八个元素
  63. 3. 使用上述的 framerateRatios 作为传入参数,开始 FFT 模块的 fit 计算,得到(分数、偏移)信息,选择分数最大的作为匹配的结论
  64. */
  65. // 1.
  66. framerateRatios := make([]float64, 0)
  67. framerateRatios = p.getFramerateRatios2Try()
  68. // 2.
  69. if useSubtitleOrAudioAsBase == false {
  70. inferredFramerateRatioFromLength := float64(infoBase.GetNumFrames()) / float64(infoSrc.GetNumFrames())
  71. framerateRatios = append(framerateRatios, inferredFramerateRatioFromLength)
  72. }
  73. // 3.
  74. fffAligner := NewFFTAligner(p.MaxOffsetSeconds, SampleRate)
  75. // 需要在这个偏移之下
  76. maxOffsetSamples := p.MaxOffsetSeconds * SampleRate
  77. if maxOffsetSamples < 0 {
  78. maxOffsetSamples = -maxOffsetSamples
  79. }
  80. for _, framerateRatio := range framerateRatios {
  81. /*
  82. ffsubsync 的 pipeline 有这三个步骤
  83. 1. parse 解析字幕
  84. 2. scale 根据帧数比率调整时间轴
  85. 3. speech_extract 从字幕转换为 VAD 的语音检测信息
  86. */
  87. // 外部传入
  88. // 1. parse 解析字幕
  89. tmpInfoSrc := clone.Clone(infoSrc).(*subparser.FileInfo)
  90. // 2. scale 根据帧数比率调整时间轴
  91. err := tmpInfoSrc.ChangeDialoguesTimeByFramerateRatio(framerateRatio)
  92. if err != nil {
  93. // 还原
  94. println("ChangeDialoguesTimeByFramerateRatio", err)
  95. tmpInfoSrc = clone.Clone(infoSrc).(*subparser.FileInfo)
  96. }
  97. // 3. speech_extract 从字幕转换为 VAD 的语音检测信息
  98. tmpSrcInfoUnit, err := sub_helper.GetVADInfoFeatureFromSubNew(tmpInfoSrc, 0)
  99. if err != nil {
  100. return PipeResult{}, err
  101. }
  102. bestOffset, score := fffAligner.Fit(baseVADInfo, tmpSrcInfoUnit.GetVADFloatSlice())
  103. pipeResult := PipeResult{
  104. Score: score,
  105. BestOffset: bestOffset,
  106. ScaleFactor: framerateRatio,
  107. ScaledFileInfo: tmpInfoSrc,
  108. }
  109. pipeResults = append(pipeResults, pipeResult)
  110. }
  111. if useGSS == true {
  112. // 最后一个才需要额外使用 GSS
  113. // 使用 GSS
  114. optFunc := func(framerateRatio float64, isLastIter bool) float64 {
  115. // 1. parse 解析字幕
  116. tmpInfoSrc := clone.Clone(infoSrc).(*subparser.FileInfo)
  117. // 2. scale 根据帧数比率调整时间轴
  118. err := tmpInfoSrc.ChangeDialoguesTimeByFramerateRatio(framerateRatio)
  119. if err != nil {
  120. // 还原
  121. println("ChangeDialoguesTimeByFramerateRatio", err)
  122. tmpInfoSrc = clone.Clone(infoSrc).(*subparser.FileInfo)
  123. }
  124. // 3. speech_extract 从字幕转换为 VAD 的语音检测信息
  125. tmpSrcInfoUnit, err := sub_helper.GetVADInfoFeatureFromSubNew(tmpInfoSrc, 0)
  126. if err != nil {
  127. return 0
  128. }
  129. // 然后进行 base 与 src 匹配计算,将每一次变动 framerateRatio 计算得到的 偏移值和分数进行记录
  130. bestOffset, score := fffAligner.Fit(baseVADInfo, tmpSrcInfoUnit.GetVADFloatSlice())
  131. println(fmt.Sprintf("got score %.0f (offset %d) for ratio %.3f", score, bestOffset, framerateRatio))
  132. // 放到外部的存储中
  133. if isLastIter == true {
  134. pipeResult := PipeResult{
  135. Score: score,
  136. BestOffset: bestOffset,
  137. ScaleFactor: framerateRatio,
  138. ScaledFileInfo: tmpInfoSrc,
  139. }
  140. pipeResults = append(pipeResults, pipeResult)
  141. }
  142. return -score
  143. }
  144. gss.Gss(optFunc, MinFramerateRatio, MaxFramerateRatio, 1e-4, nil)
  145. }
  146. // 先进行过滤
  147. filterPipeResults := make([]PipeResult, 0)
  148. for _, result := range pipeResults {
  149. if result.BestOffset < maxOffsetSamples {
  150. filterPipeResults = append(filterPipeResults, result)
  151. }
  152. }
  153. if len(filterPipeResults) <= 0 {
  154. return PipeResult{}, errors.New(fmt.Sprintf("AutoFixTimeline failed; you can set 'MaxOffSetTime' > %d", p.MaxOffsetSeconds) +
  155. fmt.Sprintf(" Or this two subtiles are not fited to this video!"))
  156. }
  157. // 从得到的结果里面找到分数最高的
  158. sort.Sort(PipeResults(filterPipeResults))
  159. maxPipeResult := filterPipeResults[len(filterPipeResults)-1]
  160. return maxPipeResult, nil
  161. }
  162. // FixSubFileTimeline 这里传入的 scaledInfoSrc 是从 pipeResults 筛选出来的最大分数的 FileInfo
  163. // infoSrc 是从源文件读取出来的,这样才能正确匹配 Content 中的时间戳
  164. func (p Pipeline) FixSubFileTimeline(infoSrc, scaledInfoSrc *subparser.FileInfo, inOffsetTime float64, desSaveSubFileFullPath string) (string, error) {
  165. /*
  166. 从解析的实例中,正常来说是可以匹配出所有的 Dialogue 对话的 Start 和 End time 的信息
  167. 然后找到对应的字幕的文件,进行文件内容的替换来做时间轴的校正
  168. */
  169. // 偏移时间
  170. offsetTime := time.Duration(inOffsetTime*1000) * time.Millisecond
  171. fixContent := scaledInfoSrc.Content
  172. /*
  173. 这里进行时间转字符串的时候有一点比较特殊
  174. 正常来说输出的格式是类似 15:04:05.00
  175. 那么有个问题,字幕的时间格式是 0:00:12.00, 小时,是个数,除非有跨度到 20 小时的视频,不然小时就应该是个数
  176. 这就需要一个额外的函数去处理这些情况
  177. */
  178. timeFormat := scaledInfoSrc.GetTimeFormat()
  179. for index, scaledSrcOneDialogue := range scaledInfoSrc.Dialogues {
  180. timeStart, err := my_util.ParseTime(scaledSrcOneDialogue.StartTime)
  181. if err != nil {
  182. return "", err
  183. }
  184. timeEnd, err := my_util.ParseTime(scaledSrcOneDialogue.EndTime)
  185. if err != nil {
  186. return "", err
  187. }
  188. fixTimeStart := timeStart.Add(offsetTime)
  189. fixTimeEnd := timeEnd.Add(offsetTime)
  190. fixContent = strings.ReplaceAll(fixContent, infoSrc.Dialogues[index].StartTime, my_util.Time2SubTimeString(fixTimeStart, timeFormat))
  191. fixContent = strings.ReplaceAll(fixContent, infoSrc.Dialogues[index].EndTime, my_util.Time2SubTimeString(fixTimeEnd, timeFormat))
  192. }
  193. dstFile, err := os.Create(desSaveSubFileFullPath)
  194. if err != nil {
  195. return "", err
  196. }
  197. defer func() {
  198. _ = dstFile.Close()
  199. }()
  200. _, err = dstFile.WriteString(fixContent)
  201. if err != nil {
  202. return "", err
  203. }
  204. return fixContent, nil
  205. }
  206. func (p *Pipeline) getFramerateRatios2Try() []float64 {
  207. if len(p.framerateRatios) > 0 {
  208. return p.framerateRatios
  209. }
  210. p.framerateRatios = append(p.framerateRatios, 1.0)
  211. p.framerateRatios = append(p.framerateRatios, FramerateRatios...)
  212. for i := 0; i < len(FramerateRatios); i++ {
  213. p.framerateRatios = append(p.framerateRatios, 1.0/FramerateRatios[i])
  214. }
  215. return p.framerateRatios
  216. }
  // FramerateRatios lists the base frame-rate conversion ratios that are tried
// (together with their reciprocals) when aligning subtitles.
var FramerateRatios = []float64{
	24.0 / 23.976,
	25.0 / 23.976,
	25.0 / 24.0,
}

const (
	// MinFramerateRatio is the lower bound of the interval passed to gss.Gss.
	MinFramerateRatio = 0.9
	// MaxFramerateRatio is the upper bound of the interval passed to gss.Gss.
	MaxFramerateRatio = 1.1
	// DefaultMaxOffsetSeconds is presumably the default for
	// Pipeline.MaxOffsetSeconds; it is not referenced in this file.
	DefaultMaxOffsetSeconds = 60
	// SampleRate is the number of VAD samples per second; offsets in samples
	// are seconds multiplied by this value.
	SampleRate = 100
)
  222. type PipeResult struct {
  223. Score float64
  224. BestOffset int
  225. ScaleFactor float64
  226. ScaledFileInfo *subparser.FileInfo
  227. }
  228. // GetOffsetTime 从偏移得到偏移时间
  229. func (p PipeResult) GetOffsetTime() float64 {
  230. return float64(p.BestOffset) / 100.0
  231. }
  232. type PipeResults []PipeResult
  233. func (d PipeResults) Len() int {
  234. return len(d)
  235. }
  236. func (d PipeResults) Swap(i, j int) {
  237. d[i], d[j] = d[j], d[i]
  238. }
  239. func (d PipeResults) Less(i, j int) bool {
  240. return d[i].Score < d[j].Score
  241. }