fixer.go 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277
  1. package sub_timeline_fixer
  2. import (
  3. "errors"
  4. "fmt"
  5. "github.com/allanpk716/ChineseSubFinder/internal/common"
  6. "github.com/allanpk716/ChineseSubFinder/internal/types/subparser"
  7. "github.com/go-echarts/go-echarts/v2/opts"
  8. "github.com/grd/stat"
  9. "github.com/james-bowman/nlp"
  10. "github.com/james-bowman/nlp/measures/pairwise"
  11. "github.com/mndrix/tukey"
  12. "gonum.org/v1/gonum/mat"
  13. "strings"
  14. "time"
  15. )
  16. // StopWordCounter 停止词统计
  17. func StopWordCounter(inString string, per int) []string {
  18. statisticTimes := make(map[string]int)
  19. wordsLength := strings.Fields(inString)
  20. for counts, word := range wordsLength {
  21. // 判断key是否存在,这个word是字符串,这个counts是统计的word的次数。
  22. word, ok := statisticTimes[word]
  23. if ok {
  24. word = word
  25. statisticTimes[wordsLength[counts]] = statisticTimes[wordsLength[counts]] + 1
  26. } else {
  27. statisticTimes[wordsLength[counts]] = 1
  28. }
  29. }
  30. stopWords := make([]string, 0)
  31. mapByValue := sortMapByValue(statisticTimes)
  32. breakIndex := len(mapByValue) * per / 100
  33. for index, wordInfo := range mapByValue {
  34. if index > breakIndex {
  35. break
  36. }
  37. stopWords = append(stopWords, wordInfo.Name)
  38. }
  39. return stopWords
  40. }
  41. // NewTFIDF 初始化 TF-IDF
  42. func NewTFIDF(testCorpus []string) (*nlp.Pipeline, mat.Matrix, error) {
  43. newCountVectoriser := nlp.NewCountVectoriser(StopWords...)
  44. transformer := nlp.NewTfidfTransformer()
  45. // set k (the number of dimensions following truncation) to 4
  46. reducer := nlp.NewTruncatedSVD(4)
  47. lsiPipeline := nlp.NewPipeline(newCountVectoriser, transformer, reducer)
  48. // Transform the corpus into an LSI fitting the model to the documents in the process
  49. lsi, err := lsiPipeline.FitTransform(testCorpus...)
  50. if err != nil {
  51. return nil, lsi, errors.New(fmt.Sprintf("Failed to process testCorpus documents because %v", err))
  52. }
  53. return lsiPipeline, lsi, nil
  54. }
  55. // GetOffsetTime 暂时只支持英文的基准字幕,源字幕必须是双语中英字幕
  56. func GetOffsetTime(infoBase, infoSrc *subparser.FileInfo, staticLineFPath string) (float64, error) {
  57. if staticLineFPath == "" {
  58. staticLineFPath = "bar.html"
  59. }
  60. // 构建基准语料库,目前阶段只需要考虑是 En 的就行了
  61. var baseCorpus = make([]string, 0)
  62. for _, oneDialogueEx := range infoBase.DialoguesEx {
  63. baseCorpus = append(baseCorpus, oneDialogueEx.EnLine)
  64. }
  65. // 初始化
  66. pipLine, tfidf, err := NewTFIDF(baseCorpus)
  67. if err != nil {
  68. return 0, err
  69. }
  70. /*
  71. 确认两个字幕间的偏移,暂定的方案是两边都连续匹配上 5 个索引,再抽取一个对话的时间进行修正计算
  72. */
  73. maxCompareDialogue := 5
  74. // 基线的长度
  75. _, docsLength := tfidf.Dims()
  76. var matchIndexList = make([]MatchIndex, 0)
  77. sc := NewSubCompare(maxCompareDialogue)
  78. // 开始比较相似度,默认认为是 Ch_en 就行了
  79. for srcIndex, srcOneDialogueEx := range infoSrc.DialoguesEx {
  80. // 这里只考虑 英文 的语言
  81. if srcOneDialogueEx.EnLine == "" {
  82. continue
  83. }
  84. // run the query through the same pipeline that was fitted to the corpus and
  85. // to project it into the same dimensional space
  86. queryVector, err := pipLine.Transform(srcOneDialogueEx.EnLine)
  87. if err != nil {
  88. return 0, err
  89. }
  90. // iterate over document feature vectors (columns) in the LSI matrix and compare
  91. // with the query vector for similarity. Similarity is determined by the difference
  92. // between the angles of the vectors known as the cosine similarity
  93. highestSimilarity := -1.0
  94. // 匹配上的基准的索引
  95. var baseIndex int
  96. // 这里理论上需要把所有的基线遍历一次,但是,一般来说,两个字幕不可能差距在 50 行
  97. // 这样的好处是有助于提高搜索的性能
  98. // 那么就以当前的 src 的位置,向前、向后各 50 来遍历
  99. nowMaxScanLength := srcIndex + 50
  100. nowMinScanLength := srcIndex - 50
  101. if nowMinScanLength < 0 {
  102. nowMinScanLength = 0
  103. }
  104. if nowMaxScanLength > docsLength {
  105. nowMaxScanLength = docsLength
  106. }
  107. for i := nowMinScanLength; i < nowMaxScanLength; i++ {
  108. similarity := pairwise.CosineSimilarity(queryVector.(mat.ColViewer).ColView(0), tfidf.(mat.ColViewer).ColView(i))
  109. if similarity > highestSimilarity {
  110. baseIndex = i
  111. highestSimilarity = similarity
  112. }
  113. }
  114. if sc.Add(baseIndex, srcIndex) == false {
  115. sc.Clear()
  116. sc.Add(baseIndex, srcIndex)
  117. }
  118. if sc.Check() == false {
  119. continue
  120. }
  121. startBaseIndex, startSrcIndex := sc.GetStartIndex()
  122. matchIndexList = append(matchIndexList, MatchIndex{
  123. BaseNowIndex: startBaseIndex,
  124. SrcNowIndex: startSrcIndex,
  125. Similarity: highestSimilarity,
  126. })
  127. //println(fmt.Sprintf("Similarity: %f Base[%d] %s-%s '%s' <--> Src[%d] %s-%s '%s'",
  128. // highestSimilarity,
  129. // baseIndex, infoBase.DialoguesEx[baseIndex].StartTime, infoBase.DialoguesEx[baseIndex].EndTime, baseCorpus[baseIndex],
  130. // srcIndex, srcOneDialogueEx.StartTime, srcOneDialogueEx.EndTime, srcOneDialogueEx.EnLine))
  131. }
  132. timeFormat := ""
  133. if infoBase.Ext == common.SubExtASS || infoBase.Ext == common.SubExtSSA {
  134. timeFormat = timeFormatAss
  135. } else {
  136. timeFormat = timeFormatSrt
  137. }
  138. var startDiffTimeLineData = make([]opts.LineData, 0)
  139. var endDiffTimeLineData = make([]opts.LineData, 0)
  140. var tmpStartDiffTime = make([]float64, 0)
  141. var tmpEndDiffTime = make([]float64, 0)
  142. var startDiffTimeList = make(stat.Float64Slice, 0)
  143. var endDiffTimeList = make(stat.Float64Slice, 0)
  144. var xAxis = make([]string, 0)
  145. // 上面找出了连续匹配 maxCompareDialogue:N 次的字幕语句块
  146. // 求出平均时间偏移
  147. for mIndex, matchIndexItem := range matchIndexList {
  148. for i := 0; i < maxCompareDialogue; i++ {
  149. // 这里会统计连续的这 5 句话的时间差
  150. tmpBaseIndex := matchIndexItem.BaseNowIndex + i
  151. tmpSrcIndex := matchIndexItem.SrcNowIndex + i
  152. baseTimeStart, err := time.Parse(timeFormat, infoBase.DialoguesEx[tmpBaseIndex].StartTime)
  153. if err != nil {
  154. println("baseTimeStart", err)
  155. continue
  156. }
  157. baseTimeEnd, err := time.Parse(timeFormat, infoBase.DialoguesEx[tmpBaseIndex].EndTime)
  158. if err != nil {
  159. println("baseTimeEnd", err)
  160. continue
  161. }
  162. srtTimeStart, err := time.Parse(timeFormat, infoSrc.DialoguesEx[tmpSrcIndex].StartTime)
  163. if err != nil {
  164. println("srtTimeStart", err)
  165. continue
  166. }
  167. srtTimeEnd, err := time.Parse(timeFormat, infoSrc.DialoguesEx[tmpSrcIndex].EndTime)
  168. if err != nil {
  169. println("srtTimeEnd", err)
  170. continue
  171. }
  172. TimeDiffStart := baseTimeStart.Sub(srtTimeStart)
  173. TimeDiffEnd := baseTimeEnd.Sub(srtTimeEnd)
  174. startDiffTimeLineData = append(startDiffTimeLineData, opts.LineData{Value: TimeDiffStart.Seconds()})
  175. endDiffTimeLineData = append(endDiffTimeLineData, opts.LineData{Value: TimeDiffEnd.Seconds()})
  176. tmpStartDiffTime = append(tmpStartDiffTime, TimeDiffStart.Seconds())
  177. tmpEndDiffTime = append(tmpEndDiffTime, TimeDiffEnd.Seconds())
  178. startDiffTimeList = append(startDiffTimeList, TimeDiffStart.Seconds())
  179. endDiffTimeList = append(endDiffTimeList, TimeDiffEnd.Seconds())
  180. xAxis = append(xAxis, fmt.Sprintf("%d_%d", mIndex, i))
  181. //println(fmt.Sprintf("Diff Start-End: %s - %s Base[%d] %s-%s '%s' <--> Src[%d] %s-%s '%s'",
  182. // TimeDiffStart, TimeDiffEnd,
  183. // tmpBaseIndex, infoBase.DialoguesEx[tmpBaseIndex].StartTime, infoBase.DialoguesEx[tmpBaseIndex].EndTime, infoBase.DialoguesEx[tmpBaseIndex].EnLine,
  184. // tmpSrcIndex, infoSrc.DialoguesEx[tmpSrcIndex].StartTime, infoSrc.DialoguesEx[tmpSrcIndex].EndTime, infoSrc.DialoguesEx[tmpSrcIndex].EnLine))
  185. }
  186. //println("---------------------------------------------")
  187. }
  188. oldMean := stat.Mean(startDiffTimeList)
  189. oldSd := stat.Sd(startDiffTimeList)
  190. newMean := -1.0
  191. newSd := -1.0
  192. per := 1.0
  193. // 如果 SD 较大的时候才需要剔除
  194. if oldSd > 0.1 {
  195. var outliersMap = make(map[float64]int, 0)
  196. outliers, _, _ := tukey.Outliers(0.3, tmpStartDiffTime)
  197. for _, outlier := range outliers {
  198. outliersMap[outlier] = 0
  199. }
  200. var newStartDiffTimeList = make([]float64, 0)
  201. for _, f := range tmpStartDiffTime {
  202. _, ok := outliersMap[f]
  203. if ok == true {
  204. continue
  205. }
  206. newStartDiffTimeList = append(newStartDiffTimeList, f)
  207. }
  208. orgLen := startDiffTimeList.Len()
  209. startDiffTimeList = make(stat.Float64Slice, 0)
  210. for _, f := range newStartDiffTimeList {
  211. startDiffTimeList = append(startDiffTimeList, f)
  212. }
  213. newLen := startDiffTimeList.Len()
  214. per = float64(newLen) / float64(orgLen)
  215. newMean = stat.Mean(startDiffTimeList)
  216. newSd = stat.Sd(startDiffTimeList)
  217. }
  218. if newMean == -1.0 {
  219. newMean = oldMean
  220. }
  221. if newSd == -1.0 {
  222. newSd = oldSd
  223. }
  224. err = SaveStaticLine(staticLineFPath, infoBase.Name, infoSrc.Name,
  225. per, oldMean, oldSd, newMean, newSd, xAxis,
  226. startDiffTimeLineData, endDiffTimeLineData)
  227. if err != nil {
  228. return 0, err
  229. }
  230. return newMean, nil
  231. }
// FixSubTimeline corrects the timeline of a subtitle.
// NOTE(review): the body is currently empty — infoSrc, offsetTime and
// desSaveSubFPath are not used yet.
func FixSubTimeline(infoSrc *subparser.FileInfo, offsetTime float64, desSaveSubFPath string) {
	/*
		Planned approach: the parsed instance should normally provide the Start
		and End time of every Dialogue. Then locate the corresponding subtitle
		file and replace its content to correct the timeline.
	*/
}
  239. const timeFormatAss = "15:04:05.00"
  240. const timeFormatSrt = "15:04:05,000"