whatlanggo.go 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225
  1. package language
  2. import (
  3. "github.com/abadojack/whatlanggo"
  4. "github.com/allanpk716/ChineseSubFinder/internal/types/language"
  5. "github.com/allanpk716/ChineseSubFinder/internal/types/subparser"
  6. )
  7. // GetLangOptions 语言识别的 Options Whitelist
  8. func GetLangOptions() whatlanggo.Options {
  9. return whatlanggo.Options{
  10. Whitelist: map[whatlanggo.Lang]bool{
  11. whatlanggo.Cmn: true, // 中文 11
  12. whatlanggo.Eng: true, // 英文 15
  13. whatlanggo.Jpn: true, // 日文 32
  14. whatlanggo.Kor: true, // 韩文 37
  15. },
  16. }
  17. }
  18. // IsWhiteListLang 是否是白名单语言
  19. func IsWhiteListLang(lang whatlanggo.Lang) bool {
  20. switch lang {
  21. // 中文 英文 日文 韩文
  22. case whatlanggo.Cmn, whatlanggo.Eng, whatlanggo.Jpn, whatlanggo.Kor:
  23. return true
  24. default:
  25. return false
  26. }
  27. }
  28. // DetectSubLangAndStatistics 检测语言然后统计
  29. func DetectSubLangAndStatistics(oneDialogue subparser.OneDialogue, langDict map[int]int, usefulDialoguseEx *[]subparser.OneDialogueEx, chLines *[]string, otherLines *[]string) {
  30. var oneDialogueEx subparser.OneDialogueEx
  31. oneDialogueEx.StartTime = oneDialogue.StartTime
  32. oneDialogueEx.EndTime = oneDialogue.EndTime
  33. for _, line := range oneDialogue.Lines {
  34. info := whatlanggo.DetectWithOptions(line, GetLangOptions())
  35. tmpLang := -1
  36. if IsWhiteListLang(info.Lang) == true {
  37. tmpLang = (int)(info.Lang)
  38. }
  39. // 这一种语言的 key 是否存在,不存在则新建,存在再数值 +1
  40. value, ok := langDict[tmpLang]
  41. if ok == true {
  42. // 累加
  43. value++
  44. langDict[tmpLang] = value
  45. } else {
  46. langDict[tmpLang] = 1
  47. }
  48. // 统计中文有多少行
  49. if info.Lang == whatlanggo.Cmn {
  50. *chLines = append(*chLines, line)
  51. } else {
  52. *otherLines = append(*otherLines, line)
  53. }
  54. // 这里可能是一个 dialogue 里面有两句话,而且两句话都是一个类型的语言,所以其实需要的是合并
  55. switch info.Lang {
  56. case whatlanggo.Cmn:
  57. oneDialogueEx.ChLine += line + " "
  58. case whatlanggo.Eng:
  59. oneDialogueEx.EnLine += line + " "
  60. case whatlanggo.Kor:
  61. oneDialogueEx.KrLine += line + " "
  62. case whatlanggo.Jpn:
  63. oneDialogueEx.JpLine += line + " "
  64. }
  65. }
  66. *usefulDialoguseEx = append(*usefulDialoguseEx, oneDialogueEx)
  67. }
  68. // SubLangStatistics2SubLangType 由分析的信息转换为具体是什么字幕的语言类型
  69. func SubLangStatistics2SubLangType(countLineFeed, AllLines float32, langDict map[int]int, chLines []string) language.MyLanguage {
  70. const basePer = 0.8
  71. // 是否是双语?
  72. isDouble := false
  73. perLines := countLineFeed / AllLines
  74. // 第二行字幕出现的概率大于 80% 应该稳了吧,不然还能三语?
  75. if perLines > basePer {
  76. isDouble = true
  77. }
  78. // 中文(包含了 chs 以及 cht,这一级是无法区分的,需要额外的简体和繁体区分方法)
  79. countChinese, hasChinese := langDict[int(whatlanggo.Cmn)]
  80. // 英文
  81. countEnglish, hasEnglish := langDict[int(whatlanggo.Eng)]
  82. // 日文
  83. countJapanese, hasJapanese := langDict[int(whatlanggo.Jpn)]
  84. // 韩文
  85. countKorean, hasKorean := langDict[int(whatlanggo.Kor)]
  86. // 0 - No , 1 - Chs, 2 - Cht
  87. isNoOrChsOrCht := 0
  88. isChsCount := 0
  89. if hasChinese {
  90. for _, line := range chLines {
  91. // 判断是简体还是繁体
  92. if chDict.IsChs(line, 0.9) == true {
  93. isChsCount++
  94. }
  95. }
  96. // 简体句子的占比超过 80%
  97. if float32(isChsCount)/float32(len(chLines)) > 0.8 {
  98. isNoOrChsOrCht = 1
  99. } else {
  100. isNoOrChsOrCht = 2
  101. }
  102. }
  103. // 这里有一种情况,就是双语的字幕不是在一个时间轴上的,而是分成两个时间轴的
  104. // 那么之前的 isDouble 判断就失效了,需要补判一次
  105. if isDouble == false {
  106. if hasChinese && hasEnglish {
  107. isDouble = isDoubleLang(countChinese, countEnglish)
  108. }
  109. if hasChinese && hasJapanese {
  110. isDouble = isDoubleLang(countChinese, countJapanese)
  111. }
  112. if hasChinese && hasKorean {
  113. isDouble = isDoubleLang(countChinese, countKorean)
  114. }
  115. }
  116. // 优先判断双语
  117. if isDouble == true {
  118. // 首先得在外面统计就知道是双语
  119. if hasChinese && hasEnglish {
  120. // 简体 英文
  121. return chIsChsOrCht(language.ChineseSimpleEnglish, isNoOrChsOrCht)
  122. } else if hasChinese && hasJapanese {
  123. // 简体 日文
  124. return chIsChsOrCht(language.ChineseSimpleJapanese, isNoOrChsOrCht)
  125. } else if hasChinese && hasKorean {
  126. // 简体 韩文
  127. return chIsChsOrCht(language.ChineseSimpleKorean, isNoOrChsOrCht)
  128. } else if hasChinese {
  129. return chIsChsOrCht(language.ChineseSimple, isNoOrChsOrCht)
  130. } else if hasEnglish {
  131. return language.English
  132. } else if hasJapanese {
  133. return language.Japanese
  134. } else if hasKorean {
  135. return language.Korean
  136. } else {
  137. return language.Unknown
  138. }
  139. } else {
  140. // 如果比例达不到,那么就是单语言,所以最多的那个就是当前的语言
  141. // 这里的字典是有可能出现
  142. if hasChinese {
  143. // 那么起码要占比 80% 对吧
  144. perLines = float32(countChinese) / AllLines
  145. if perLines > basePer {
  146. return chIsChsOrCht(language.ChineseSimple, isNoOrChsOrCht)
  147. }
  148. }
  149. if hasEnglish {
  150. // 那么起码要占比 80% 对吧
  151. perLines = float32(countEnglish) / AllLines
  152. if perLines > basePer {
  153. return language.English
  154. }
  155. }
  156. if hasJapanese {
  157. // 那么起码要占比 80% 对吧
  158. perLines = float32(countJapanese) / AllLines
  159. if perLines > basePer {
  160. return language.Japanese
  161. }
  162. }
  163. if hasKorean {
  164. // 那么起码要占比 80% 对吧
  165. perLines = float32(countKorean) / AllLines
  166. if perLines > basePer {
  167. return language.Korean
  168. }
  169. }
  170. return language.Unknown
  171. }
  172. }
  173. // 跟中文相关的再使用,其他的无需传入
  174. func chIsChsOrCht(inLanguage language.MyLanguage, isNoOrChsOrCht int) language.MyLanguage {
  175. // 输出原来的
  176. if isNoOrChsOrCht == 0 || isNoOrChsOrCht == 1 {
  177. return inLanguage
  178. }
  179. switch inLanguage {
  180. case language.ChineseSimpleEnglish:
  181. // 简体 英文
  182. return language.ChineseTraditionalEnglish
  183. case language.ChineseSimpleJapanese:
  184. // 简体 日文
  185. return language.ChineseTraditionalJapanese
  186. case language.ChineseSimpleKorean:
  187. // 简体 韩文
  188. return language.ChineseTraditionalKorean
  189. case language.ChineseSimple:
  190. // 简体
  191. return language.ChineseTraditional
  192. default:
  193. return inLanguage
  194. }
  195. }
  196. func isDoubleLang(count0, count1 int) bool {
  197. if count0 >= count1 {
  198. f := float32(count0) / float32(count1)
  199. if f >= 1 && f <= 1.4 {
  200. return true
  201. } else {
  202. return false
  203. }
  204. } else {
  205. f := float32(count1) / float32(count0)
  206. if f >= 1 && f <= 1.4 {
  207. return true
  208. } else {
  209. return false
  210. }
  211. }
  212. }