| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282 |
- package sub_timeline_fixer
- import (
- "errors"
- "fmt"
- "github.com/allanpk716/ChineseSubFinder/internal/common"
- "github.com/allanpk716/ChineseSubFinder/internal/logic/sub_parser/ass"
- "github.com/allanpk716/ChineseSubFinder/internal/logic/sub_parser/srt"
- "github.com/allanpk716/ChineseSubFinder/internal/pkg/sub_parser_hub"
- "github.com/go-echarts/go-echarts/v2/opts"
- "github.com/grd/stat"
- "github.com/james-bowman/nlp"
- "github.com/james-bowman/nlp/measures/pairwise"
- "github.com/mndrix/tukey"
- "gonum.org/v1/gonum/mat"
- "strings"
- "time"
- )
- // StopWordCounter 停止词统计
- func StopWordCounter(inString string, per int) []string {
- statisticTimes := make(map[string]int)
- wordsLength := strings.Fields(inString)
- for counts, word := range wordsLength {
- // 判断key是否存在,这个word是字符串,这个counts是统计的word的次数。
- word, ok := statisticTimes[word]
- if ok {
- word = word
- statisticTimes[wordsLength[counts]] = statisticTimes[wordsLength[counts]] + 1
- } else {
- statisticTimes[wordsLength[counts]] = 1
- }
- }
- stopWords := make([]string, 0)
- mapByValue := sortMapByValue(statisticTimes)
- breakIndex := len(mapByValue) * per / 100
- for index, wordInfo := range mapByValue {
- if index > breakIndex {
- break
- }
- stopWords = append(stopWords, wordInfo.Name)
- }
- return stopWords
- }
- // NewTFIDF 初始化 TF-IDF
- func NewTFIDF(testCorpus []string) (*nlp.Pipeline, mat.Matrix, error) {
- newCountVectoriser := nlp.NewCountVectoriser(StopWords...)
- transformer := nlp.NewTfidfTransformer()
- // set k (the number of dimensions following truncation) to 4
- reducer := nlp.NewTruncatedSVD(4)
- lsiPipeline := nlp.NewPipeline(newCountVectoriser, transformer, reducer)
- // Transform the corpus into an LSI fitting the model to the documents in the process
- lsi, err := lsiPipeline.FitTransform(testCorpus...)
- if err != nil {
- return nil, lsi, errors.New(fmt.Sprintf("Failed to process testCorpus documents because %v", err))
- }
- return lsiPipeline, lsi, nil
- }
- // GetOffsetTime 暂时只支持英文的基准字幕,源字幕必须是双语中英字幕
- func GetOffsetTime(baseEngSubFPath, srcSubFPath string) (time.Duration, error) {
- subParserHub := sub_parser_hub.NewSubParserHub(ass.NewParser(), srt.NewParser())
- bFind, infoBase, err := subParserHub.DetermineFileTypeFromFile(baseEngSubFPath)
- if err != nil {
- return 0, err
- }
- if bFind == false {
- return 0, nil
- }
- bFind, infoSrc, err := subParserHub.DetermineFileTypeFromFile(srcSubFPath)
- if err != nil {
- return 0, err
- }
- if bFind == false {
- return 0, nil
- }
- // 构建基准语料库,目前阶段只需要考虑是 En 的就行了
- var baseCorpus = make([]string, 0)
- for _, oneDialogueEx := range infoBase.DialoguesEx {
- baseCorpus = append(baseCorpus, oneDialogueEx.EnLine)
- }
- // 初始化
- pipLine, tfidf, err := NewTFIDF(baseCorpus)
- if err != nil {
- return 0, err
- }
- /*
- 确认两个字幕间的偏移,暂定的方案是两边都连续匹配上 5 个索引,再抽取一个对话的时间进行修正计算
- */
- maxCompareDialogue := 5
- // 基线的长度
- _, docsLength := tfidf.Dims()
- var matchIndexList = make([]MatchIndex, 0)
- sc := NewSubCompare(maxCompareDialogue)
- // 开始比较相似度,默认认为是 Ch_en 就行了
- for srcIndex, srcOneDialogueEx := range infoSrc.DialoguesEx {
- // 这里只考虑 英文 的语言
- if srcOneDialogueEx.EnLine == "" {
- continue
- }
- // run the query through the same pipeline that was fitted to the corpus and
- // to project it into the same dimensional space
- queryVector, err := pipLine.Transform(srcOneDialogueEx.EnLine)
- if err != nil {
- return 0, err
- }
- // iterate over document feature vectors (columns) in the LSI matrix and compare
- // with the query vector for similarity. Similarity is determined by the difference
- // between the angles of the vectors known as the cosine similarity
- highestSimilarity := -1.0
- // 匹配上的基准的索引
- var baseIndex int
- // 这里理论上需要把所有的基线遍历一次,但是,一般来说,两个字幕不可能差距在 50 行
- // 这样的好处是有助于提高搜索的性能
- // 那么就以当前的 src 的位置,向前、向后各 50 来遍历
- nowMaxScanLength := srcIndex + 50
- nowMinScanLength := srcIndex - 50
- if nowMinScanLength < 0 {
- nowMinScanLength = 0
- }
- if nowMaxScanLength > docsLength {
- nowMaxScanLength = docsLength
- }
- for i := nowMinScanLength; i < nowMaxScanLength; i++ {
- similarity := pairwise.CosineSimilarity(queryVector.(mat.ColViewer).ColView(0), tfidf.(mat.ColViewer).ColView(i))
- if similarity > highestSimilarity {
- baseIndex = i
- highestSimilarity = similarity
- }
- }
- if sc.Add(baseIndex, srcIndex) == false {
- sc.Clear()
- sc.Add(baseIndex, srcIndex)
- }
- if sc.Check() == false {
- continue
- }
- startBaseIndex, startSrcIndex := sc.GetStartIndex()
- matchIndexList = append(matchIndexList, MatchIndex{
- BaseNowIndex: startBaseIndex,
- SrcNowIndex: startSrcIndex,
- Similarity: highestSimilarity,
- })
- //println(fmt.Sprintf("Similarity: %f Base[%d] %s-%s '%s' <--> Src[%d] %s-%s '%s'",
- // highestSimilarity,
- // baseIndex, infoBase.DialoguesEx[baseIndex].StartTime, infoBase.DialoguesEx[baseIndex].EndTime, baseCorpus[baseIndex],
- // srcIndex, srcOneDialogueEx.StartTime, srcOneDialogueEx.EndTime, srcOneDialogueEx.EnLine))
- }
- timeFormat := ""
- if infoBase.Ext == common.SubExtASS || infoBase.Ext == common.SubExtSSA {
- timeFormat = timeFormatAss
- } else {
- timeFormat = timeFormatSrt
- }
- var startDiffTimeLineData = make([]opts.LineData, 0)
- var endDiffTimeLineData = make([]opts.LineData, 0)
- var tmpStartDiffTime = make([]float64, 0)
- var tmpEndDiffTime = make([]float64, 0)
- var startDiffTimeList = make(stat.Float64Slice, 0)
- var endDiffTimeList = make(stat.Float64Slice, 0)
- var xAxis = make([]string, 0)
- // 上面找出了连续匹配 maxCompareDialogue:N 次的字幕语句块
- // 求出平均时间偏移
- for mIndex, matchIndexItem := range matchIndexList {
- for i := 0; i < maxCompareDialogue; i++ {
- // 这里会统计连续的这 5 句话的时间差
- tmpBaseIndex := matchIndexItem.BaseNowIndex + i
- tmpSrcIndex := matchIndexItem.SrcNowIndex + i
- baseTimeStart, err := time.Parse(timeFormat, infoBase.DialoguesEx[tmpBaseIndex].StartTime)
- if err != nil {
- println("baseTimeStart", err)
- continue
- }
- baseTimeEnd, err := time.Parse(timeFormat, infoBase.DialoguesEx[tmpBaseIndex].EndTime)
- if err != nil {
- println("baseTimeEnd", err)
- continue
- }
- srtTimeStart, err := time.Parse(timeFormat, infoSrc.DialoguesEx[tmpSrcIndex].StartTime)
- if err != nil {
- println("srtTimeStart", err)
- continue
- }
- srtTimeEnd, err := time.Parse(timeFormat, infoSrc.DialoguesEx[tmpSrcIndex].EndTime)
- if err != nil {
- println("srtTimeEnd", err)
- continue
- }
- TimeDiffStart := baseTimeStart.Sub(srtTimeStart)
- TimeDiffEnd := baseTimeEnd.Sub(srtTimeEnd)
- startDiffTimeLineData = append(startDiffTimeLineData, opts.LineData{Value: TimeDiffStart.Seconds()})
- endDiffTimeLineData = append(endDiffTimeLineData, opts.LineData{Value: TimeDiffEnd.Seconds()})
- tmpStartDiffTime = append(tmpStartDiffTime, TimeDiffStart.Seconds())
- tmpEndDiffTime = append(tmpEndDiffTime, TimeDiffEnd.Seconds())
- startDiffTimeList = append(startDiffTimeList, TimeDiffStart.Seconds())
- endDiffTimeList = append(endDiffTimeList, TimeDiffEnd.Seconds())
- xAxis = append(xAxis, fmt.Sprintf("%d_%d", mIndex, i))
- //println(fmt.Sprintf("Diff Start-End: %s - %s Base[%d] %s-%s '%s' <--> Src[%d] %s-%s '%s'",
- // TimeDiffStart, TimeDiffEnd,
- // tmpBaseIndex, infoBase.DialoguesEx[tmpBaseIndex].StartTime, infoBase.DialoguesEx[tmpBaseIndex].EndTime, infoBase.DialoguesEx[tmpBaseIndex].EnLine,
- // tmpSrcIndex, infoSrc.DialoguesEx[tmpSrcIndex].StartTime, infoSrc.DialoguesEx[tmpSrcIndex].EndTime, infoSrc.DialoguesEx[tmpSrcIndex].EnLine))
- }
- //println("---------------------------------------------")
- }
- oldMean := stat.Mean(startDiffTimeList)
- oldSd := stat.Sd(startDiffTimeList)
- newMean := -1.0
- newSd := -1.0
- per := 1.0
- // 如果 SD 较大的时候才需要剔除
- if oldSd > 0.1 {
- var outliersMap = make(map[float64]int, 0)
- outliers, _, _ := tukey.Outliers(0.1, tmpStartDiffTime)
- for _, outlier := range outliers {
- outliersMap[outlier] = 0
- }
- var newStartDiffTimeList = make([]float64, 0)
- for _, f := range tmpStartDiffTime {
- _, ok := outliersMap[f]
- if ok == true {
- continue
- }
- newStartDiffTimeList = append(newStartDiffTimeList, f)
- }
- orgLen := startDiffTimeList.Len()
- startDiffTimeList = make(stat.Float64Slice, 0)
- for _, f := range newStartDiffTimeList {
- startDiffTimeList = append(startDiffTimeList, f)
- }
- newLen := startDiffTimeList.Len()
- per = float64(newLen) / float64(orgLen)
- newMean = stat.Mean(startDiffTimeList)
- newSd = stat.Sd(startDiffTimeList)
- }
- if newMean == -1.0 {
- newMean = oldMean
- }
- if newSd == -1.0 {
- newSd = oldSd
- }
- err = SaveStaticLine("bar.html", infoBase.Name, infoSrc.Name,
- per, oldMean, oldSd, newMean, newSd, xAxis,
- startDiffTimeLineData, endDiffTimeLineData)
- if err != nil {
- return 0, err
- }
- return 0, nil
- }
- const timeFormatAss = "15:04:05.00"
- const timeFormatSrt = "15:04:05,000"
|