fixer.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. package sub_timeline_fixer
  2. import (
  3. "fmt"
  4. "github.com/allanpk716/ChineseSubFinder/internal/common"
  5. "github.com/allanpk716/ChineseSubFinder/internal/pkg"
  6. "github.com/allanpk716/ChineseSubFinder/internal/pkg/log_helper"
  7. "github.com/allanpk716/ChineseSubFinder/internal/types/sub_timeline_fiexer"
  8. "github.com/allanpk716/ChineseSubFinder/internal/types/subparser"
  9. "github.com/go-echarts/go-echarts/v2/opts"
  10. "github.com/grd/stat"
  11. "github.com/james-bowman/nlp/measures/pairwise"
  12. "github.com/mndrix/tukey"
  13. "gonum.org/v1/gonum/mat"
  14. "os"
  15. "strings"
  16. "time"
  17. )
// SubTimelineFixer fixes the timeline offset of a subtitle file by comparing
// it against a baseline subtitle. All behavior is driven by the injected
// configuration (e.g. MaxCompareDialogue, MinMatchedPercent are read by the
// methods below).
type SubTimelineFixer struct {
	// fixerConfig holds the tuning parameters used by GetOffsetTime / FixSubTimeline.
	fixerConfig sub_timeline_fiexer.SubTimelineFixerConfig
}
  21. func NewSubTimelineFixer(fixerConfig sub_timeline_fiexer.SubTimelineFixerConfig) *SubTimelineFixer {
  22. return &SubTimelineFixer{
  23. fixerConfig: fixerConfig,
  24. }
  25. }
  26. // StopWordCounter 停止词统计
  27. func (s *SubTimelineFixer) StopWordCounter(inString string, per int) []string {
  28. statisticTimes := make(map[string]int)
  29. wordsLength := strings.Fields(inString)
  30. for counts, word := range wordsLength {
  31. // 判断key是否存在,这个word是字符串,这个counts是统计的word的次数。
  32. word, ok := statisticTimes[word]
  33. if ok {
  34. word = word
  35. statisticTimes[wordsLength[counts]] = statisticTimes[wordsLength[counts]] + 1
  36. } else {
  37. statisticTimes[wordsLength[counts]] = 1
  38. }
  39. }
  40. stopWords := make([]string, 0)
  41. mapByValue := sortMapByValue(statisticTimes)
  42. breakIndex := len(mapByValue) * per / 100
  43. for index, wordInfo := range mapByValue {
  44. if index > breakIndex {
  45. break
  46. }
  47. stopWords = append(stopWords, wordInfo.Name)
  48. }
  49. return stopWords
  50. }
// GetOffsetTime computes the time offset between a baseline subtitle (infoBase)
// and a source subtitle (infoSrc). Currently the baseline is matched on its
// English lines only, so the source subtitle must carry English lines too
// (intended for bilingual Chinese/English subtitles).
//
// It TF-IDF-vectorizes the baseline's English dialogue lines, then for every
// English line of the source finds the most cosine-similar baseline line
// (scanning only ±50 positions around the current index for performance).
// Runs of MaxCompareDialogue consecutive matches are collected, their
// start/end time differences are averaged, and outliers are trimmed with
// Tukey's fences when the standard deviation is large.
//
// Parameters:
//   - infoBase, infoSrc: parsed subtitle files (baseline and the one to fix).
//   - staticLineFileSavePath: when non-empty, an HTML chart of the time
//     differences is written there for debugging.
//   - debugInfoFileSavePath: when non-empty, a text dump of the matched
//     time pairs is written there.
//
// Returns (ok, meanOffsetSeconds, sd, err). ok is false when the fraction of
// sequence-matched dialogues is below MinMatchedPercent (the offset values
// are still filled in so the caller can inspect them).
func (s *SubTimelineFixer) GetOffsetTime(infoBase, infoSrc *subparser.FileInfo, staticLineFileSavePath string, debugInfoFileSavePath string) (bool, float64, float64, error) {
	var debugInfos = make([]string, 0)
	// Build the baseline corpus; at this stage only the English lines are used.
	var baseCorpus = make([]string, 0)
	var baseDialogueFilterMap = make(map[int]int, 0)
	/*
		Originally every baseline dialogue was fed into the matcher, which pulled
		in unwanted empty lines. Blank English lines are skipped here, so a map
		from corpus index back to the original DialoguesEx index is required.
	*/
	for index, oneDialogueEx := range infoBase.DialoguesEx {
		if oneDialogueEx.EnLine == "" {
			continue
		}
		baseCorpus = append(baseCorpus, oneDialogueEx.EnLine)
		baseDialogueFilterMap[len(baseCorpus)-1] = index
	}
	// Fit the TF-IDF pipeline on the baseline corpus.
	pipLine, tfidf, err := NewTFIDF(baseCorpus)
	if err != nil {
		return false, 0, 0, err
	}
	/*
		To confirm the offset between the two subtitles, the tentative scheme is:
		match MaxCompareDialogue consecutive indexes on both sides, then take the
		dialogue times of that run to compute the correction.
	*/
	maxCompareDialogue := s.fixerConfig.MaxCompareDialogue
	// Number of documents (columns) in the baseline TF-IDF matrix.
	_, docsLength := tfidf.Dims()
	var matchIndexList = make([]MatchIndex, 0)
	sc := NewSubCompare(maxCompareDialogue)
	// Compare similarities; the source is assumed to be Ch_en (bilingual).
	for srcIndex := 0; srcIndex < len(infoSrc.DialoguesEx); {
		srcOneDialogueEx := infoSrc.DialoguesEx[srcIndex]
		// Only the English line is considered.
		if srcOneDialogueEx.EnLine == "" {
			srcIndex++
			continue
		}
		// run the query through the same pipeline that was fitted to the corpus and
		// to project it into the same dimensional space
		queryVector, err := pipLine.Transform(srcOneDialogueEx.EnLine)
		if err != nil {
			return false, 0, 0, err
		}
		// iterate over document feature vectors (columns) in the LSI matrix and compare
		// with the query vector for similarity. Similarity is determined by the difference
		// between the angles of the vectors known as the cosine similarity
		highestSimilarity := -1.0
		// Index of the baseline line that matched best.
		var baseIndex int
		// In theory every baseline line should be scanned, but two subtitles are
		// unlikely to drift apart by more than 50 lines, so scanning ±50 around
		// the current src position greatly improves search performance.
		nowMaxScanLength := srcIndex + 50
		nowMinScanLength := srcIndex - 50
		if nowMinScanLength < 0 {
			nowMinScanLength = 0
		}
		if nowMaxScanLength > docsLength {
			nowMaxScanLength = docsLength
		}
		for i := nowMinScanLength; i < nowMaxScanLength; i++ {
			similarity := pairwise.CosineSimilarity(queryVector.(mat.ColViewer).ColView(0), tfidf.(mat.ColViewer).ColView(i))
			if similarity > highestSimilarity {
				baseIndex = i
				highestSimilarity = similarity
			}
		}
		startBaseIndex, startSrcIndex := sc.GetStartIndex()
		// Add rejects a non-consecutive pair: restart scanning one line after the
		// start of the failed run.
		if sc.Add(baseIndex, srcIndex) == false {
			sc.Clear()
			srcIndex = startSrcIndex + 1
			continue
			//sc.Add(baseIndex, srcIndex)
		}
		// Check reports whether a full run of maxCompareDialogue matches was collected.
		if sc.Check() == false {
			srcIndex++
			continue
		} else {
			sc.Clear()
		}
		// Note: startBaseIndex here is the corpus index (mapped back to the
		// original dialogue index later via baseDialogueFilterMap).
		matchIndexList = append(matchIndexList, MatchIndex{
			BaseNowIndex: startBaseIndex,
			//BaseNowIndex: baseDialogueFilterMap[startBaseIndex],
			SrcNowIndex: startSrcIndex,
			Similarity:  highestSimilarity,
		})
		//println(fmt.Sprintf("Similarity: %f Base[%d] %s-%s '%s' <--> Src[%d] %s-%s '%s'",
		//	highestSimilarity,
		//	baseIndex, infoBase.DialoguesEx[baseIndex].StartTime, infoBase.DialoguesEx[baseIndex].EndTime, baseCorpus[baseIndex],
		//	srcIndex, srcOneDialogueEx.StartTime, srcOneDialogueEx.EndTime, srcOneDialogueEx.EnLine))
		srcIndex++
	}
	// Pick the timestamp layout based on the baseline's container format.
	timeFormat := ""
	if infoBase.Ext == common.SubExtASS || infoBase.Ext == common.SubExtSSA {
		timeFormat = timeFormatAss
	} else {
		timeFormat = timeFormatSrt
	}
	var startDiffTimeLineData = make([]opts.LineData, 0)
	var endDiffTimeLineData = make([]opts.LineData, 0)
	var tmpStartDiffTime = make([]float64, 0)
	var tmpEndDiffTime = make([]float64, 0)
	var startDiffTimeList = make(stat.Float64Slice, 0)
	var endDiffTimeList = make(stat.Float64Slice, 0)
	var xAxis = make([]string, 0)
	// The loop above found blocks of maxCompareDialogue (N) consecutively
	// matched dialogue lines; now compute the average time offset over them.
	for mIndex, matchIndexItem := range matchIndexList {
		for i := 0; i < maxCompareDialogue; i++ {
			// Accumulate the time differences of these N consecutive lines.
			//tmpBaseIndex := matchIndexItem.BaseNowIndex + i
			tmpBaseIndex := baseDialogueFilterMap[matchIndexItem.BaseNowIndex+i]
			tmpSrcIndex := matchIndexItem.SrcNowIndex + i
			baseTimeStart, err := time.Parse(timeFormat, infoBase.DialoguesEx[tmpBaseIndex].StartTime)
			if err != nil {
				return false, 0, 0, err
			}
			baseTimeEnd, err := time.Parse(timeFormat, infoBase.DialoguesEx[tmpBaseIndex].EndTime)
			if err != nil {
				return false, 0, 0, err
			}
			srtTimeStart, err := time.Parse(timeFormat, infoSrc.DialoguesEx[tmpSrcIndex].StartTime)
			if err != nil {
				return false, 0, 0, err
			}
			srtTimeEnd, err := time.Parse(timeFormat, infoSrc.DialoguesEx[tmpSrcIndex].EndTime)
			if err != nil {
				return false, 0, 0, err
			}
			TimeDiffStart := baseTimeStart.Sub(srtTimeStart)
			TimeDiffEnd := baseTimeEnd.Sub(srtTimeEnd)
			startDiffTimeLineData = append(startDiffTimeLineData, opts.LineData{Value: TimeDiffStart.Seconds()})
			endDiffTimeLineData = append(endDiffTimeLineData, opts.LineData{Value: TimeDiffEnd.Seconds()})
			tmpStartDiffTime = append(tmpStartDiffTime, TimeDiffStart.Seconds())
			tmpEndDiffTime = append(tmpEndDiffTime, TimeDiffEnd.Seconds())
			startDiffTimeList = append(startDiffTimeList, TimeDiffStart.Seconds())
			endDiffTimeList = append(endDiffTimeList, TimeDiffEnd.Seconds())
			xAxis = append(xAxis, fmt.Sprintf("%d_%d", mIndex, i))
			debugInfos = append(debugInfos, "bs "+infoBase.DialoguesEx[tmpBaseIndex].StartTime+" <-> "+infoBase.DialoguesEx[tmpBaseIndex].EndTime)
			debugInfos = append(debugInfos, "sc "+infoSrc.DialoguesEx[tmpSrcIndex].StartTime+" <-> "+infoSrc.DialoguesEx[tmpSrcIndex].EndTime)
			debugInfos = append(debugInfos, "StartDiffTime: "+fmt.Sprintf("%f", TimeDiffStart.Seconds()))
			//println(fmt.Sprintf("Diff Start-End: %s - %s Base[%d] %s-%s '%s' <--> Src[%d] %s-%s '%s'",
			//	TimeDiffStart, TimeDiffEnd,
			//	tmpBaseIndex, infoBase.DialoguesEx[tmpBaseIndex].StartTime, infoBase.DialoguesEx[tmpBaseIndex].EndTime, infoBase.DialoguesEx[tmpBaseIndex].EnLine,
			//	tmpSrcIndex, infoSrc.DialoguesEx[tmpSrcIndex].StartTime, infoSrc.DialoguesEx[tmpSrcIndex].EndTime, infoSrc.DialoguesEx[tmpSrcIndex].EnLine))
		}
		debugInfos = append(debugInfos, "---------------------------------------------")
		//println("---------------------------------------------")
	}
	oldMean := stat.Mean(startDiffTimeList)
	oldSd := stat.Sd(startDiffTimeList)
	newMean := -1.0
	newSd := -1.0
	per := 1.0
	// Outliers are trimmed only when the standard deviation is large.
	if oldSd > 0.1 {
		var outliersMap = make(map[float64]int, 0)
		outliers, _, _ := tukey.Outliers(0.3, tmpStartDiffTime)
		for _, outlier := range outliers {
			outliersMap[outlier] = 0
		}
		// Rebuild the start-diff list without the outlier values.
		var newStartDiffTimeList = make([]float64, 0)
		for _, f := range tmpStartDiffTime {
			_, ok := outliersMap[f]
			if ok == true {
				continue
			}
			newStartDiffTimeList = append(newStartDiffTimeList, f)
		}
		orgLen := startDiffTimeList.Len()
		startDiffTimeList = make(stat.Float64Slice, 0)
		for _, f := range newStartDiffTimeList {
			startDiffTimeList = append(startDiffTimeList, f)
		}
		newLen := startDiffTimeList.Len()
		// per = fraction of samples kept after outlier removal.
		per = float64(newLen) / float64(orgLen)
		newMean = stat.Mean(startDiffTimeList)
		newSd = stat.Sd(startDiffTimeList)
	}
	// -1.0 sentinels mean "no trimming happened"; fall back to the raw stats.
	if newMean == -1.0 {
		newMean = oldMean
	}
	if newSd == -1.0 {
		newSd = oldSd
	}
	// When a path is supplied, generate the debug chart file.
	if staticLineFileSavePath != "" {
		//staticLineFileSavePath = "bar.html"
		err = SaveStaticLine(staticLineFileSavePath, infoBase.Name, infoSrc.Name,
			per, oldMean, oldSd, newMean, newSd, xAxis,
			startDiffTimeLineData, endDiffTimeLineData)
		if err != nil {
			return false, 0, 0, err
		}
	}
	// If too few dialogues were sequence-matched relative to the whole corpus,
	// the result is considered unreliable and the caller should skip fixing
	// (the debug files above are still produced).
	matchIndexLineCount := len(matchIndexList) * maxCompareDialogue
	//perMatch := float64(matchIndexLineCount) / float64(len(infoSrc.DialoguesEx))
	perMatch := float64(matchIndexLineCount) / float64(len(baseCorpus))
	if perMatch < s.fixerConfig.MinMatchedPercent {
		tmpContent := infoSrc.Name + fmt.Sprintf(" Sequence match %d dialogues (< %f%%), Skip,", s.fixerConfig.MaxCompareDialogue, s.fixerConfig.MinMatchedPercent*100) + fmt.Sprintf(" %f%% ", perMatch*100)
		debugInfos = append(debugInfos, tmpContent)
		log_helper.GetLogger().Debugln(tmpContent)
	} else {
		tmpContent := infoSrc.Name + fmt.Sprintf(" Sequence match %d dialogues,", s.fixerConfig.MaxCompareDialogue) + fmt.Sprintf(" %f%% ", perMatch*100)
		debugInfos = append(debugInfos, tmpContent)
		log_helper.GetLogger().Debugln(tmpContent)
	}
	// Dump the matched-timeline debug list when a path is supplied.
	if debugInfoFileSavePath != "" {
		err = pkg.WriteStrings2File(debugInfoFileSavePath, debugInfos)
		if err != nil {
			return false, 0, 0, err
		}
	}
	// Even when the match ratio is judged too low, the computed offset values
	// are still returned alongside ok=false.
	if perMatch < s.fixerConfig.MinMatchedPercent {
		return false, newMean, newSd, nil
	}
	return true, newMean, newSd, nil
}
  274. // FixSubTimeline 校正时间轴
  275. func (s *SubTimelineFixer) FixSubTimeline(infoSrc *subparser.FileInfo, inOffsetTime float64, desSaveSubFileFullPath string) (string, error) {
  276. /*
  277. 从解析的实例中,正常来说是可以匹配出所有的 Dialogue 对话的 Start 和 End time 的信息
  278. 然后找到对应的字幕的文件,进行文件内容的替换来做时间轴的校正
  279. */
  280. // 偏移时间
  281. offsetTime := time.Duration(inOffsetTime*1000) * time.Millisecond
  282. timeFormat := ""
  283. if infoSrc.Ext == common.SubExtASS || infoSrc.Ext == common.SubExtSSA {
  284. timeFormat = timeFormatAss
  285. } else {
  286. timeFormat = timeFormatSrt
  287. }
  288. fixContent := infoSrc.Content
  289. for _, srcOneDialogue := range infoSrc.Dialogues {
  290. timeStart, err := time.Parse(timeFormat, srcOneDialogue.StartTime)
  291. if err != nil {
  292. return "", err
  293. }
  294. timeEnd, err := time.Parse(timeFormat, srcOneDialogue.EndTime)
  295. if err != nil {
  296. return "", err
  297. }
  298. fixTimeStart := timeStart.Add(offsetTime)
  299. fixTimeEnd := timeEnd.Add(offsetTime)
  300. fixContent = strings.ReplaceAll(fixContent, srcOneDialogue.StartTime, fixTimeStart.Format(timeFormat))
  301. fixContent = strings.ReplaceAll(fixContent, srcOneDialogue.EndTime, fixTimeEnd.Format(timeFormat))
  302. }
  303. dstFile, err := os.Create(desSaveSubFileFullPath)
  304. if err != nil {
  305. return "", err
  306. }
  307. defer func() {
  308. _ = dstFile.Close()
  309. }()
  310. _, err = dstFile.WriteString(fixContent)
  311. if err != nil {
  312. return "", err
  313. }
  314. return fixContent, nil
  315. }
  316. const timeFormatAss = "15:04:05.00"
  317. const timeFormatSrt = "15:04:05,000"
  318. const FixMask = "-fix"