fixer_test.go 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199
  1. package sub_timeline_fixer
  2. import (
  3. "fmt"
  4. "github.com/allanpk716/ChineseSubFinder/internal/logic/sub_parser/ass"
  5. "github.com/allanpk716/ChineseSubFinder/internal/logic/sub_parser/srt"
  6. "github.com/allanpk716/ChineseSubFinder/internal/pkg"
  7. "github.com/allanpk716/ChineseSubFinder/internal/pkg/sub_helper"
  8. "github.com/allanpk716/ChineseSubFinder/internal/pkg/sub_parser_hub"
  9. "github.com/james-bowman/nlp"
  10. "github.com/james-bowman/nlp/measures/pairwise"
  11. "gonum.org/v1/gonum/mat"
  12. "path"
  13. "strings"
  14. "testing"
  15. )
  16. func TestStopWordCounter(t *testing.T) {
  17. testDataPath := "../../../TestData/FixTimeline"
  18. testRootDir, err := pkg.CopyTestData(testDataPath)
  19. if err != nil {
  20. t.Fatal(err)
  21. }
  22. subParserHub := sub_parser_hub.NewSubParserHub(ass.NewParser(), srt.NewParser())
  23. bFind, info, err := subParserHub.DetermineFileTypeFromFile(path.Join(testRootDir, "R&M S05E10 - English.srt"))
  24. if err != nil {
  25. t.Fatal(err)
  26. }
  27. if bFind == false {
  28. t.Fatal("not match sub types")
  29. }
  30. allString := strings.Join(info.OtherLines, " ")
  31. stopWords := StopWordCounter(strings.ToLower(allString), 5)
  32. print(len(stopWords))
  33. println(info.Name)
  34. }
  35. func TestGetOffsetTime(t *testing.T) {
  36. testDataPath := "../../../TestData/FixTimeline"
  37. testRootDir, err := pkg.CopyTestData(testDataPath)
  38. if err != nil {
  39. t.Fatal(err)
  40. }
  41. subParserHub := sub_parser_hub.NewSubParserHub(ass.NewParser(), srt.NewParser())
  42. type args struct {
  43. enSubFile string
  44. ch_enSubFile string
  45. staticLineFileSavePath string
  46. }
  47. tests := []struct {
  48. name string
  49. args args
  50. want float64
  51. wantErr bool
  52. }{
  53. {name: "R&M S05E01", args: args{enSubFile: path.Join(testRootDir, "R&M S05E01 - English.srt"),
  54. ch_enSubFile: path.Join(testRootDir, "R&M S05E01 - 简英.srt"),
  55. staticLineFileSavePath: "bar.html"}, want: -6.42981818181818, wantErr: false},
  56. {name: "R&M S05E10", args: args{enSubFile: path.Join(testRootDir, "R&M S05E10 - English.ass"),
  57. ch_enSubFile: path.Join(testRootDir, "R&M S05E10 - 简英.ass"),
  58. staticLineFileSavePath: "bar.html"}, want: -6.335985401459854, wantErr: false},
  59. {name: "R&M S05E10-shooter", args: args{enSubFile: path.Join(testRootDir, "R&M S05E10 - English.ass"),
  60. ch_enSubFile: path.Join(testRootDir, "R&M S05E10 - 简英-shooter.ass"),
  61. staticLineFileSavePath: "bar.html"}, want: -6.335985401459854, wantErr: false},
  62. {name: "基地 S01E03", args: args{enSubFile: path.Join(testRootDir, "基地 S01E03 - English.ass"),
  63. ch_enSubFile: path.Join(testRootDir, "基地 S01E03 - 简英.ass"),
  64. staticLineFileSavePath: "bar.html"}, want: -32.09061538461539, wantErr: false},
  65. {name: "Dan Brown's The Lost Symbol - S01E01", args: args{
  66. enSubFile: path.Join(testRootDir, tmpSubDataFolderName, "Dan Brown's The Lost Symbol - S01E01 - As Above, So Below WEBDL-720p", "Dan Brown's The Lost Symbol - S01E01 - As Above, So Below WEBDL-720p.chinese(inside).ass"),
  67. ch_enSubFile: path.Join(testRootDir, tmpSubDataFolderName, "Dan Brown's The Lost Symbol - S01E01 - As Above, So Below WEBDL-720p", "Dan Brown's The Lost Symbol - S01E01 - As Above, So Below WEBDL-720p.chinese(简英,shooter).ass"),
  68. staticLineFileSavePath: "bar.html"},
  69. want: 1.3217821782178225, wantErr: false},
  70. {name: "Dan Brown's The Lost Symbol - S01E02", args: args{
  71. enSubFile: path.Join(testRootDir, tmpSubDataFolderName, "Dan Brown's The Lost Symbol - S01E02 - The Araf WEBDL-1080p", "Dan Brown's The Lost Symbol - S01E02 - The Araf WEBDL-1080p.chinese(inside).ass"),
  72. ch_enSubFile: path.Join(testRootDir, tmpSubDataFolderName, "Dan Brown's The Lost Symbol - S01E02 - The Araf WEBDL-1080p", "Dan Brown's The Lost Symbol - S01E02 - The Araf WEBDL-1080p.chinese(简英,subhd).ass"),
  73. staticLineFileSavePath: "bar.html"},
  74. want: -0.5253383458646617, wantErr: false},
  75. {name: "Dan Brown's The Lost Symbol - S01E03", args: args{
  76. enSubFile: path.Join(testRootDir, tmpSubDataFolderName, "Dan Brown's The Lost Symbol - S01E03 - Murmuration WEBDL-1080p", "Dan Brown's The Lost Symbol - S01E03 - Murmuration WEBDL-1080p.chinese(inside).ass"),
  77. ch_enSubFile: path.Join(testRootDir, tmpSubDataFolderName, "Dan Brown's The Lost Symbol - S01E03 - Murmuration WEBDL-1080p", "Dan Brown's The Lost Symbol - S01E03 - Murmuration WEBDL-1080p.chinese(简英,shooter).ass"),
  78. staticLineFileSavePath: "bar.html"},
  79. want: -0.505656, wantErr: false},
  80. {name: "Dan Brown's The Lost Symbol - S01E03", args: args{
  81. enSubFile: path.Join(testRootDir, tmpSubDataFolderName, "Dan Brown's The Lost Symbol - S01E03 - Murmuration WEBDL-1080p", "Dan Brown's The Lost Symbol - S01E03 - Murmuration WEBDL-1080p.chinese(inside).ass"),
  82. ch_enSubFile: path.Join(testRootDir, tmpSubDataFolderName, "Dan Brown's The Lost Symbol - S01E03 - Murmuration WEBDL-1080p", "Dan Brown's The Lost Symbol - S01E03 - Murmuration WEBDL-1080p.chinese(繁英,xunlei).ass"),
  83. staticLineFileSavePath: "bar.html"},
  84. want: -0.505656, wantErr: false},
  85. }
  86. for _, tt := range tests {
  87. t.Run(tt.name, func(t *testing.T) {
  88. bFind, infoBase, err := subParserHub.DetermineFileTypeFromFile(tt.args.enSubFile)
  89. if err != nil {
  90. t.Fatal(err)
  91. }
  92. if bFind == false {
  93. t.Fatal("sub not match")
  94. }
  95. /*
  96. 这里发现一个梗,内置的英文字幕导出的时候,有可能需要合并多个 Dialogue,见
  97. internal/pkg/sub_helper/sub_helper.go 中 MergeMultiDialogue4EngSubtitle 的实现
  98. */
  99. sub_helper.MergeMultiDialogue4EngSubtitle(infoBase)
  100. bFind, infoSrc, err := subParserHub.DetermineFileTypeFromFile(tt.args.ch_enSubFile)
  101. if err != nil {
  102. t.Fatal(err)
  103. }
  104. if bFind == false {
  105. t.Fatal("sub not match")
  106. }
  107. /*
  108. 这里发现一个梗,内置的英文字幕导出的时候,有可能需要合并多个 Dialogue,见
  109. internal/pkg/sub_helper/sub_helper.go 中 MergeMultiDialogue4EngSubtitle 的实现
  110. */
  111. sub_helper.MergeMultiDialogue4EngSubtitle(infoSrc)
  112. got, err := GetOffsetTime(infoBase, infoSrc, tt.args.ch_enSubFile+"-bar.html", tt.args.ch_enSubFile+".log")
  113. if (err != nil) != tt.wantErr {
  114. t.Errorf("GetOffsetTime() error = %v, wantErr %v", err, tt.wantErr)
  115. return
  116. }
  117. // 在一个正负范围内都可以接受
  118. if got > tt.want-0.1 && got < tt.want+0.1 {
  119. } else {
  120. t.Errorf("GetOffsetTime() got = %v, want %v", got, tt.want)
  121. }
  122. //if got != tt.want {
  123. // t.Errorf("GetOffsetTime() got = %v, want %v", got, tt.want)
  124. //}
  125. println(fmt.Sprintf("GetOffsetTime: %fs", got))
  126. })
  127. }
  128. }
  129. func TestTFIDF(t *testing.T) {
  130. testCorpus := []string{
  131. "The quick brown fox jumped over the lazy dog",
  132. "hey diddle diddle, the cat and the fiddle",
  133. "the cow jumped over the moon",
  134. "the little dog laughed to see such fun",
  135. "and the dish ran away with the spoon",
  136. }
  137. query := "the brown fox ran around the dog"
  138. vectoriser := nlp.NewCountVectoriser(StopWords...)
  139. transformer := nlp.NewTfidfTransformer()
  140. // set k (the number of dimensions following truncation) to 4
  141. reducer := nlp.NewTruncatedSVD(4)
  142. lsiPipeline := nlp.NewPipeline(vectoriser, transformer, reducer)
  143. // Transform the corpus into an LSI fitting the model to the documents in the process
  144. lsi, err := lsiPipeline.FitTransform(testCorpus...)
  145. if err != nil {
  146. fmt.Printf("Failed to process documents because %v", err)
  147. return
  148. }
  149. // run the query through the same pipeline that was fitted to the corpus and
  150. // to project it into the same dimensional space
  151. queryVector, err := lsiPipeline.Transform(query)
  152. if err != nil {
  153. fmt.Printf("Failed to process documents because %v", err)
  154. return
  155. }
  156. // iterate over document feature vectors (columns) in the LSI matrix and compare
  157. // with the query vector for similarity. Similarity is determined by the difference
  158. // between the angles of the vectors known as the cosine similarity
  159. highestSimilarity := -1.0
  160. var matched int
  161. _, docs := lsi.Dims()
  162. for i := 0; i < docs; i++ {
  163. similarity := pairwise.CosineSimilarity(queryVector.(mat.ColViewer).ColView(0), lsi.(mat.ColViewer).ColView(i))
  164. if similarity > highestSimilarity {
  165. matched = i
  166. highestSimilarity = similarity
  167. }
  168. }
  169. fmt.Printf("Matched '%s'", testCorpus[matched])
  170. // Output: Matched 'The quick brown fox jumped over the lazy dog'
  171. }
  // tmpSubDataFolderName is the directory, directly below the copied test data
  // root, under which TestGetOffsetTime looks up the "Dan Brown's The Lost
  // Symbol" subtitle fixtures.
  const (
  	tmpSubDataFolderName = "SubFixCache"
  )