fixer_test.go 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134
  1. package sub_timeline_fixer
  2. import (
  3. "fmt"
  4. "github.com/allanpk716/ChineseSubFinder/internal/logic/sub_parser/ass"
  5. "github.com/allanpk716/ChineseSubFinder/internal/logic/sub_parser/srt"
  6. "github.com/allanpk716/ChineseSubFinder/internal/pkg"
  7. "github.com/allanpk716/ChineseSubFinder/internal/pkg/sub_parser_hub"
  8. "github.com/james-bowman/nlp"
  9. "github.com/james-bowman/nlp/measures/pairwise"
  10. "gonum.org/v1/gonum/mat"
  11. "path"
  12. "strings"
  13. "testing"
  14. )
  15. func TestStopWordCounter(t *testing.T) {
  16. testDataPath := "../../../TestData/FixTimeline"
  17. testRootDir, err := pkg.CopyTestData(testDataPath)
  18. if err != nil {
  19. t.Fatal(err)
  20. }
  21. subParserHub := sub_parser_hub.NewSubParserHub(ass.NewParser(), srt.NewParser())
  22. bFind, info, err := subParserHub.DetermineFileTypeFromFile(path.Join(testRootDir, "R&M S05E10 - English.srt"))
  23. if err != nil {
  24. t.Fatal(err)
  25. }
  26. if bFind == false {
  27. t.Fatal("not match sub types")
  28. }
  29. allString := strings.Join(info.OtherLines, " ")
  30. stopWords := StopWordCounter(strings.ToLower(allString), 5)
  31. print(len(stopWords))
  32. println(info.Name)
  33. }
  34. func TestGetOffsetTime(t *testing.T) {
  35. testDataPath := "../../../TestData/FixTimeline"
  36. testRootDir, err := pkg.CopyTestData(testDataPath)
  37. if err != nil {
  38. t.Fatal(err)
  39. }
  40. enSubFile := path.Join(testRootDir, "R&M S05E01 - English.srt")
  41. ch_enSubFile := path.Join(testRootDir, "R&M S05E01 - 简英.srt")
  42. //enSubFile := path.Join(testRootDir, "R&M S05E10 - English.ass")
  43. //ch_enSubFile := path.Join(testRootDir, "R&M S05E10 - 简英.ass")
  44. //ch_enSubFile := path.Join(testRootDir, "R&M S05E10 - 简英-shooter.ass")
  45. //enSubFile := path.Join(testRootDir, "基地 S01E03 - English.ass")
  46. //ch_enSubFile := path.Join(testRootDir, "基地 S01E03 - 简英.ass")
  47. subParserHub := sub_parser_hub.NewSubParserHub(ass.NewParser(), srt.NewParser())
  48. bFind, infoBase, err := subParserHub.DetermineFileTypeFromFile(enSubFile)
  49. if err != nil {
  50. t.Fatal(err)
  51. }
  52. if bFind == false {
  53. t.Fatal("sub not match")
  54. }
  55. bFind, infoSrc, err := subParserHub.DetermineFileTypeFromFile(ch_enSubFile)
  56. if err != nil {
  57. t.Fatal(err)
  58. }
  59. if bFind == false {
  60. t.Fatal("sub not match")
  61. }
  62. time, err := GetOffsetTime(infoBase, infoSrc, "")
  63. if err != nil {
  64. t.Fatal(err)
  65. }
  66. println(fmt.Sprintf("GetOffsetTime: %fs", time))
  67. }
  68. func TestTFIDF(t *testing.T) {
  69. testCorpus := []string{
  70. "The quick brown fox jumped over the lazy dog",
  71. "hey diddle diddle, the cat and the fiddle",
  72. "the cow jumped over the moon",
  73. "the little dog laughed to see such fun",
  74. "and the dish ran away with the spoon",
  75. }
  76. query := "the brown fox ran around the dog"
  77. vectoriser := nlp.NewCountVectoriser(StopWords...)
  78. transformer := nlp.NewTfidfTransformer()
  79. // set k (the number of dimensions following truncation) to 4
  80. reducer := nlp.NewTruncatedSVD(4)
  81. lsiPipeline := nlp.NewPipeline(vectoriser, transformer, reducer)
  82. // Transform the corpus into an LSI fitting the model to the documents in the process
  83. lsi, err := lsiPipeline.FitTransform(testCorpus...)
  84. if err != nil {
  85. fmt.Printf("Failed to process documents because %v", err)
  86. return
  87. }
  88. // run the query through the same pipeline that was fitted to the corpus and
  89. // to project it into the same dimensional space
  90. queryVector, err := lsiPipeline.Transform(query)
  91. if err != nil {
  92. fmt.Printf("Failed to process documents because %v", err)
  93. return
  94. }
  95. // iterate over document feature vectors (columns) in the LSI matrix and compare
  96. // with the query vector for similarity. Similarity is determined by the difference
  97. // between the angles of the vectors known as the cosine similarity
  98. highestSimilarity := -1.0
  99. var matched int
  100. _, docs := lsi.Dims()
  101. for i := 0; i < docs; i++ {
  102. similarity := pairwise.CosineSimilarity(queryVector.(mat.ColViewer).ColView(0), lsi.(mat.ColViewer).ColView(i))
  103. if similarity > highestSimilarity {
  104. matched = i
  105. highestSimilarity = similarity
  106. }
  107. }
  108. fmt.Printf("Matched '%s'", testCorpus[matched])
  109. // Output: Matched 'The quick brown fox jumped over the lazy dog'
  110. }