fixer_test.go 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. package sub_timeline_fixer
  2. import (
  3. "fmt"
  4. "github.com/allanpk716/ChineseSubFinder/internal/logic/sub_parser/ass"
  5. "github.com/allanpk716/ChineseSubFinder/internal/logic/sub_parser/srt"
  6. "github.com/allanpk716/ChineseSubFinder/internal/pkg"
  7. "github.com/allanpk716/ChineseSubFinder/internal/pkg/sub_parser_hub"
  8. "github.com/james-bowman/nlp"
  9. "github.com/james-bowman/nlp/measures/pairwise"
  10. "gonum.org/v1/gonum/mat"
  11. "path"
  12. "strings"
  13. "testing"
  14. )
  15. func TestStopWordCounter(t *testing.T) {
  16. testDataPath := "../../../TestData/FixTimeline"
  17. testRootDir, err := pkg.CopyTestData(testDataPath)
  18. if err != nil {
  19. t.Fatal(err)
  20. }
  21. subParserHub := sub_parser_hub.NewSubParserHub(ass.NewParser(), srt.NewParser())
  22. bFind, info, err := subParserHub.DetermineFileTypeFromFile(path.Join(testRootDir, "R&M S05E10 - English.srt"))
  23. if err != nil {
  24. t.Fatal(err)
  25. }
  26. if bFind == false {
  27. t.Fatal("not match sub types")
  28. }
  29. allString := strings.Join(info.OtherLines, " ")
  30. stopWords := StopWordCounter(strings.ToLower(allString), 5)
  31. print(len(stopWords))
  32. println(info.Name)
  33. }
  34. func TestGetOffsetTime(t *testing.T) {
  35. testDataPath := "../../../TestData/FixTimeline"
  36. testRootDir, err := pkg.CopyTestData(testDataPath)
  37. if err != nil {
  38. t.Fatal(err)
  39. }
  40. //enSubFile := path.Join(testRootDir, "R&M S05E01 - English.srt")
  41. //ch_enSubFile := path.Join(testRootDir, "R&M S05E01 - 简英.srt")
  42. //enSubFile := path.Join(testRootDir, "R&M S05E10 - English.ass")
  43. //ch_enSubFile := path.Join(testRootDir, "R&M S05E10 - 简英.ass")
  44. //ch_enSubFile := path.Join(testRootDir, "R&M S05E10 - 简英-shooter.ass")
  45. enSubFile := path.Join(testRootDir, "基地 S01E03 - English.ass")
  46. ch_enSubFile := path.Join(testRootDir, "基地 S01E03 - 简英.ass")
  47. time, err := GetOffsetTime(enSubFile, ch_enSubFile)
  48. if err != nil {
  49. return
  50. }
  51. print(time)
  52. }
  53. func TestTFIDF(t *testing.T) {
  54. testCorpus := []string{
  55. "The quick brown fox jumped over the lazy dog",
  56. "hey diddle diddle, the cat and the fiddle",
  57. "the cow jumped over the moon",
  58. "the little dog laughed to see such fun",
  59. "and the dish ran away with the spoon",
  60. }
  61. query := "the brown fox ran around the dog"
  62. vectoriser := nlp.NewCountVectoriser(StopWords...)
  63. transformer := nlp.NewTfidfTransformer()
  64. // set k (the number of dimensions following truncation) to 4
  65. reducer := nlp.NewTruncatedSVD(4)
  66. lsiPipeline := nlp.NewPipeline(vectoriser, transformer, reducer)
  67. // Transform the corpus into an LSI fitting the model to the documents in the process
  68. lsi, err := lsiPipeline.FitTransform(testCorpus...)
  69. if err != nil {
  70. fmt.Printf("Failed to process documents because %v", err)
  71. return
  72. }
  73. // run the query through the same pipeline that was fitted to the corpus and
  74. // to project it into the same dimensional space
  75. queryVector, err := lsiPipeline.Transform(query)
  76. if err != nil {
  77. fmt.Printf("Failed to process documents because %v", err)
  78. return
  79. }
  80. // iterate over document feature vectors (columns) in the LSI matrix and compare
  81. // with the query vector for similarity. Similarity is determined by the difference
  82. // between the angles of the vectors known as the cosine similarity
  83. highestSimilarity := -1.0
  84. var matched int
  85. _, docs := lsi.Dims()
  86. for i := 0; i < docs; i++ {
  87. similarity := pairwise.CosineSimilarity(queryVector.(mat.ColViewer).ColView(0), lsi.(mat.ColViewer).ColView(i))
  88. if similarity > highestSimilarity {
  89. matched = i
  90. highestSimilarity = similarity
  91. }
  92. }
  93. fmt.Printf("Matched '%s'", testCorpus[matched])
  94. // Output: Matched 'The quick brown fox jumped over the lazy dog'
  95. }