fixer.go 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256
  1. package sub_timeline_fixer
  2. import (
  3. "fmt"
  4. "github.com/allanpk716/ChineseSubFinder/internal/common"
  5. "github.com/allanpk716/ChineseSubFinder/internal/types/subparser"
  6. "github.com/go-echarts/go-echarts/v2/opts"
  7. "github.com/grd/stat"
  8. "github.com/james-bowman/nlp/measures/pairwise"
  9. "github.com/mndrix/tukey"
  10. "gonum.org/v1/gonum/mat"
  11. "strings"
  12. "time"
  13. )
  14. // StopWordCounter 停止词统计
  15. func StopWordCounter(inString string, per int) []string {
  16. statisticTimes := make(map[string]int)
  17. wordsLength := strings.Fields(inString)
  18. for counts, word := range wordsLength {
  19. // 判断key是否存在,这个word是字符串,这个counts是统计的word的次数。
  20. word, ok := statisticTimes[word]
  21. if ok {
  22. word = word
  23. statisticTimes[wordsLength[counts]] = statisticTimes[wordsLength[counts]] + 1
  24. } else {
  25. statisticTimes[wordsLength[counts]] = 1
  26. }
  27. }
  28. stopWords := make([]string, 0)
  29. mapByValue := sortMapByValue(statisticTimes)
  30. breakIndex := len(mapByValue) * per / 100
  31. for index, wordInfo := range mapByValue {
  32. if index > breakIndex {
  33. break
  34. }
  35. stopWords = append(stopWords, wordInfo.Name)
  36. }
  37. return stopWords
  38. }
// GetOffsetTime estimates the average time offset (in seconds) between a base
// subtitle and a source subtitle by matching their English dialogue lines with
// TF-IDF cosine similarity. For now only an English base subtitle is
// supported, and the source subtitle must be a bilingual Chinese-English
// subtitle. staticLineFPath is where the diagnostic chart is written; when
// empty it defaults to "bar.html". Returns the (outlier-filtered) mean of the
// per-dialogue start-time differences, or an error from parsing/transforming.
func GetOffsetTime(infoBase, infoSrc *subparser.FileInfo, staticLineFPath string) (float64, error) {
	if staticLineFPath == "" {
		staticLineFPath = "bar.html"
	}
	// Build the reference corpus; at this stage only the En lines need to be
	// considered.
	var baseCorpus = make([]string, 0)
	for _, oneDialogueEx := range infoBase.DialoguesEx {
		baseCorpus = append(baseCorpus, oneDialogueEx.EnLine)
	}
	// Initialize the TF-IDF pipeline over the base corpus.
	pipLine, tfidf, err := NewTFIDF(baseCorpus)
	if err != nil {
		return 0, err
	}
	/*
		To confirm the offset between the two subtitles, the tentative scheme
		is: both sides must match on 5 consecutive indexes, then one dialogue's
		time is sampled to compute the correction.
	*/
	maxCompareDialogue := 5
	// Length of the baseline (number of document columns in the TF-IDF matrix).
	_, docsLength := tfidf.Dims()
	var matchIndexList = make([]MatchIndex, 0)
	sc := NewSubCompare(maxCompareDialogue)
	// Start comparing similarity; the source is assumed to be Ch_en.
	for srcIndex, srcOneDialogueEx := range infoSrc.DialoguesEx {
		// Only the English line of each dialogue is considered here.
		if srcOneDialogueEx.EnLine == "" {
			continue
		}
		// run the query through the same pipeline that was fitted to the corpus and
		// to project it into the same dimensional space
		queryVector, err := pipLine.Transform(srcOneDialogueEx.EnLine)
		if err != nil {
			return 0, err
		}
		// iterate over document feature vectors (columns) in the LSI matrix and compare
		// with the query vector for similarity. Similarity is determined by the difference
		// between the angles of the vectors known as the cosine similarity
		highestSimilarity := -1.0
		// Index of the matched dialogue in the base subtitle.
		var baseIndex int
		// In theory the whole baseline should be scanned for every query, but
		// in practice two subtitles rarely differ by more than 50 lines.
		// Restricting the window improves search performance: scan 50 entries
		// before and after the current src position.
		nowMaxScanLength := srcIndex + 50
		nowMinScanLength := srcIndex - 50
		if nowMinScanLength < 0 {
			nowMinScanLength = 0
		}
		if nowMaxScanLength > docsLength {
			nowMaxScanLength = docsLength
		}
		for i := nowMinScanLength; i < nowMaxScanLength; i++ {
			similarity := pairwise.CosineSimilarity(queryVector.(mat.ColViewer).ColView(0), tfidf.(mat.ColViewer).ColView(i))
			if similarity > highestSimilarity {
				baseIndex = i
				highestSimilarity = similarity
			}
		}
		// Feed the (base, src) index pair to the consecutive-match tracker;
		// a non-consecutive pair resets the run and starts a new one.
		if sc.Add(baseIndex, srcIndex) == false {
			sc.Clear()
			sc.Add(baseIndex, srcIndex)
		}
		// Only act once maxCompareDialogue consecutive matches have accumulated.
		if sc.Check() == false {
			continue
		}
		startBaseIndex, startSrcIndex := sc.GetStartIndex()
		matchIndexList = append(matchIndexList, MatchIndex{
			BaseNowIndex: startBaseIndex,
			SrcNowIndex:  startSrcIndex,
			Similarity:   highestSimilarity,
		})
		//println(fmt.Sprintf("Similarity: %f Base[%d] %s-%s '%s' <--> Src[%d] %s-%s '%s'",
		//	highestSimilarity,
		//	baseIndex, infoBase.DialoguesEx[baseIndex].StartTime, infoBase.DialoguesEx[baseIndex].EndTime, baseCorpus[baseIndex],
		//	srcIndex, srcOneDialogueEx.StartTime, srcOneDialogueEx.EndTime, srcOneDialogueEx.EnLine))
	}
	// Pick the timestamp layout matching the base subtitle's container format.
	timeFormat := ""
	if infoBase.Ext == common.SubExtASS || infoBase.Ext == common.SubExtSSA {
		timeFormat = timeFormatAss
	} else {
		timeFormat = timeFormatSrt
	}
	var startDiffTimeLineData = make([]opts.LineData, 0)
	var endDiffTimeLineData = make([]opts.LineData, 0)
	var tmpStartDiffTime = make([]float64, 0)
	var tmpEndDiffTime = make([]float64, 0)
	var startDiffTimeList = make(stat.Float64Slice, 0)
	var endDiffTimeList = make(stat.Float64Slice, 0)
	var xAxis = make([]string, 0)
	// The loop above found dialogue blocks matched maxCompareDialogue (N)
	// times in a row; now compute the average time offset over them.
	for mIndex, matchIndexItem := range matchIndexList {
		for i := 0; i < maxCompareDialogue; i++ {
			// Accumulate the time differences of these 5 consecutive dialogues.
			tmpBaseIndex := matchIndexItem.BaseNowIndex + i
			tmpSrcIndex := matchIndexItem.SrcNowIndex + i
			baseTimeStart, err := time.Parse(timeFormat, infoBase.DialoguesEx[tmpBaseIndex].StartTime)
			if err != nil {
				return 0, err
			}
			baseTimeEnd, err := time.Parse(timeFormat, infoBase.DialoguesEx[tmpBaseIndex].EndTime)
			if err != nil {
				return 0, err
			}
			srtTimeStart, err := time.Parse(timeFormat, infoSrc.DialoguesEx[tmpSrcIndex].StartTime)
			if err != nil {
				return 0, err
			}
			srtTimeEnd, err := time.Parse(timeFormat, infoSrc.DialoguesEx[tmpSrcIndex].EndTime)
			if err != nil {
				return 0, err
			}
			TimeDiffStart := baseTimeStart.Sub(srtTimeStart)
			TimeDiffEnd := baseTimeEnd.Sub(srtTimeEnd)
			startDiffTimeLineData = append(startDiffTimeLineData, opts.LineData{Value: TimeDiffStart.Seconds()})
			endDiffTimeLineData = append(endDiffTimeLineData, opts.LineData{Value: TimeDiffEnd.Seconds()})
			tmpStartDiffTime = append(tmpStartDiffTime, TimeDiffStart.Seconds())
			tmpEndDiffTime = append(tmpEndDiffTime, TimeDiffEnd.Seconds())
			startDiffTimeList = append(startDiffTimeList, TimeDiffStart.Seconds())
			endDiffTimeList = append(endDiffTimeList, TimeDiffEnd.Seconds())
			xAxis = append(xAxis, fmt.Sprintf("%d_%d", mIndex, i))
			//println(fmt.Sprintf("Diff Start-End: %s - %s Base[%d] %s-%s '%s' <--> Src[%d] %s-%s '%s'",
			//	TimeDiffStart, TimeDiffEnd,
			//	tmpBaseIndex, infoBase.DialoguesEx[tmpBaseIndex].StartTime, infoBase.DialoguesEx[tmpBaseIndex].EndTime, infoBase.DialoguesEx[tmpBaseIndex].EnLine,
			//	tmpSrcIndex, infoSrc.DialoguesEx[tmpSrcIndex].StartTime, infoSrc.DialoguesEx[tmpSrcIndex].EndTime, infoSrc.DialoguesEx[tmpSrcIndex].EnLine))
		}
		//println("---------------------------------------------")
	}
	oldMean := stat.Mean(startDiffTimeList)
	oldSd := stat.Sd(startDiffTimeList)
	newMean := -1.0
	newSd := -1.0
	per := 1.0
	// Outliers only need to be removed when the standard deviation is large.
	if oldSd > 0.1 {
		var outliersMap = make(map[float64]int, 0)
		// Tukey fences with k = 0.3 flag the outlying start-time differences.
		outliers, _, _ := tukey.Outliers(0.3, tmpStartDiffTime)
		for _, outlier := range outliers {
			outliersMap[outlier] = 0
		}
		// Rebuild the start-diff list without the flagged outliers.
		var newStartDiffTimeList = make([]float64, 0)
		for _, f := range tmpStartDiffTime {
			_, ok := outliersMap[f]
			if ok == true {
				continue
			}
			newStartDiffTimeList = append(newStartDiffTimeList, f)
		}
		orgLen := startDiffTimeList.Len()
		startDiffTimeList = make(stat.Float64Slice, 0)
		for _, f := range newStartDiffTimeList {
			startDiffTimeList = append(startDiffTimeList, f)
		}
		newLen := startDiffTimeList.Len()
		// per records the fraction of samples kept after outlier removal.
		per = float64(newLen) / float64(orgLen)
		newMean = stat.Mean(startDiffTimeList)
		newSd = stat.Sd(startDiffTimeList)
	}
	// If no filtering happened, fall back to the unfiltered statistics.
	if newMean == -1.0 {
		newMean = oldMean
	}
	if newSd == -1.0 {
		newSd = oldSd
	}
	// Persist the diagnostic chart; failure here aborts the whole call.
	err = SaveStaticLine(staticLineFPath, infoBase.Name, infoSrc.Name,
		per, oldMean, oldSd, newMean, newSd, xAxis,
		startDiffTimeLineData, endDiffTimeLineData)
	if err != nil {
		return 0, err
	}
	return newMean, nil
}
// FixSubTimeline corrects the subtitle's timeline by offsetTime and saves the
// result to desSaveSubFPath.
// NOTE(review): not implemented yet — this is currently an empty stub.
func FixSubTimeline(infoSrc *subparser.FileInfo, offsetTime float64, desSaveSubFPath string) {
	/*
		From the parsed FileInfo instance it should normally be possible to
		extract the Start and End times of every matched Dialogue.
		The corresponding subtitle file is then located and its content is
		rewritten to apply the timeline correction.
	*/
}
  219. const timeFormatAss = "15:04:05.00"
  220. const timeFormatSrt = "15:04:05,000"