lang.go 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273
  1. package common
  2. import (
  3. "github.com/abadojack/whatlanggo"
  4. "strings"
  5. )
  6. // LangConverter 语言转换器
  7. func LangConverter(subLang string) Language {
  8. /*
  9. xunlei:未知语言、简体&英语、繁体&英语、简体、繁体、英语
  10. */
  11. if strings.Contains(subLang, MatchLangChs) {
  12. // 优先简体
  13. if strings.Contains(subLang, MatchLangEn) {
  14. // 简英
  15. return ChineseSimpleEnglish
  16. } else if strings.Contains(subLang, MatchLangJp) {
  17. // 简日
  18. return ChineseSimpleJapanese
  19. } else if strings.Contains(subLang, MatchLangKr) {
  20. // 简韩
  21. return ChineseSimpleKorean
  22. }
  23. // 默认简体中文
  24. return ChineseSimple
  25. } else if strings.Contains(subLang, MatchLangCht) {
  26. // 然后是繁体
  27. if strings.Contains(subLang, MatchLangEn) {
  28. // 繁英
  29. return ChineseTraditionalEnglish
  30. } else if strings.Contains(subLang, MatchLangJp) {
  31. // 繁日
  32. return ChineseTraditionalJapanese
  33. } else if strings.Contains(subLang, MatchLangKr) {
  34. // 繁韩
  35. return ChineseTraditionalKorean
  36. }
  37. // 默认繁体中文
  38. return ChineseTraditional
  39. } else if strings.Contains(subLang, MatchLangEn) {
  40. // 英文
  41. return English
  42. } else if strings.Contains(subLang, MatchLangJp) {
  43. // 日文
  44. return Japanese
  45. } else if strings.Contains(subLang, MatchLangKr) {
  46. // 韩文
  47. return Korean
  48. } else {
  49. // 都没有,则标记未知
  50. return Unknow
  51. }
  52. }
  53. // HasChineseLang 是否包含中文
  54. func HasChineseLang(lan Language) bool {
  55. switch lan {
  56. case ChineseSimple,
  57. ChineseTraditional,
  58. ChineseSimpleEnglish,
  59. ChineseTraditionalEnglish,
  60. ChineseSimpleJapanese,
  61. ChineseTraditionalJapanese,
  62. ChineseSimpleKorean,
  63. ChineseTraditionalKorean:
  64. return true
  65. default:
  66. return false
  67. }
  68. }
  69. // GetLangOptions 语言识别的 Options Whitelist
  70. func GetLangOptions() whatlanggo.Options {
  71. return whatlanggo.Options{
  72. Whitelist: map[whatlanggo.Lang]bool{
  73. whatlanggo.Cmn: true, // 中文 11
  74. whatlanggo.Eng: true, // 英文 15
  75. whatlanggo.Jpn: true, // 日文 32
  76. whatlanggo.Kor: true, // 韩文 37
  77. },
  78. }
  79. }
  80. // IsWhiteListLang 是否是白名单语言
  81. func IsWhiteListLang(lang whatlanggo.Lang) bool {
  82. switch lang {
  83. // 中文 英文 日文 韩文
  84. case whatlanggo.Cmn, whatlanggo.Eng,whatlanggo.Jpn,whatlanggo.Kor:
  85. return true
  86. default:
  87. return false
  88. }
  89. }
  90. // DetectSubLangAndStatistics 检测语言然后统计
  91. func DetectSubLangAndStatistics(lines []string, langDict map[int]int) {
  92. for _, line := range lines {
  93. info := whatlanggo.DetectWithOptions(line, GetLangOptions())
  94. tmpLang := -1
  95. if IsWhiteListLang(info.Lang) == true {
  96. tmpLang = (int)(info.Lang)
  97. }
  98. // 这一种语言的 key 是否存在,不存在则新建,存在再数值 +1
  99. value, ok := langDict[tmpLang]
  100. if ok == true {
  101. // 累加
  102. value++
  103. langDict[tmpLang] = value
  104. } else {
  105. langDict[tmpLang] = 1
  106. }
  107. }
  108. }
  109. // SubLangStatistics2SubLangType 由分析的信息转换为具体是什么字幕的语言类型
  110. func SubLangStatistics2SubLangType(countLineFeed, AllLines float32, langDict map[int]int) Language {
  111. const basePer = 0.8
  112. // 是否是双语?
  113. isDouble := false
  114. perLines := countLineFeed / AllLines
  115. // 第二行字幕出现的概率大于 80% 应该稳了吧,不然还能三语?
  116. if perLines > basePer {
  117. isDouble = true
  118. }
  119. // TODO 现在是没有很好的办法去识别是简体还是繁体中文的,所以···
  120. // 中文
  121. countChinese, hasChinese := langDict[int(whatlanggo.Cmn)]
  122. // 英文
  123. countEnglish, hasEnglish := langDict[int(whatlanggo.Eng)]
  124. // 日文
  125. countJapanese, hasJapanese := langDict[int(whatlanggo.Jpn)]
  126. // 韩文
  127. countKorean, hasKorean := langDict[int(whatlanggo.Kor)]
  128. // 优先判断双语
  129. if isDouble == true {
  130. // 首先得在外面统计就知道是双语
  131. if hasChinese && hasEnglish {
  132. // 简体 英文
  133. return ChineseSimpleEnglish
  134. } else if hasChinese && hasJapanese {
  135. // 简体 日文
  136. return ChineseSimpleJapanese
  137. } else if hasChinese && hasKorean {
  138. // 简体 韩文
  139. return ChineseSimpleKorean
  140. } else if hasChinese {
  141. return ChineseSimple
  142. } else if hasEnglish {
  143. return English
  144. } else if hasJapanese {
  145. return Japanese
  146. } else if hasKorean {
  147. return Korean
  148. } else {
  149. return Unknow
  150. }
  151. } else {
  152. // 如果比例达不到,那么就是单语言,所以最多的那个就是当前的语言
  153. // 这里的字典是有可能出现
  154. if hasChinese {
  155. // 那么起码要占比 80% 对吧
  156. perLines = float32(countChinese) / AllLines
  157. if perLines > basePer {
  158. return ChineseSimple
  159. }
  160. }
  161. if hasEnglish {
  162. // 那么起码要占比 80% 对吧
  163. perLines = float32(countEnglish) / AllLines
  164. if perLines > basePer {
  165. return English
  166. }
  167. }
  168. if hasJapanese {
  169. // 那么起码要占比 80% 对吧
  170. perLines = float32(countJapanese) / AllLines
  171. if perLines > basePer {
  172. return Japanese
  173. }
  174. }
  175. if hasKorean {
  176. // 那么起码要占比 80% 对吧
  177. perLines = float32(countKorean) / AllLines
  178. if perLines > basePer {
  179. return Korean
  180. }
  181. }
  182. return Unknow
  183. }
  184. }
  185. // IsChineseSimpleOrTraditional 从字幕的文件名称中尝试确认是简体还是繁体,不需要判断双语问题,有额外的解析器完成。只可能出现 ChineseSimple ChineseTraditional Unknow 三种情况
  186. func IsChineseSimpleOrTraditional(inputFileName string) Language {
  187. if strings.Contains(inputFileName, SubNameKeywordChineseSimple) || strings.Contains(inputFileName, MatchLangChs) {
  188. return ChineseSimple
  189. } else if strings.Contains(inputFileName, SubNameKeywordTraditional) || strings.Contains(inputFileName, MatchLangCht) {
  190. return ChineseTraditional
  191. }
  192. return Unknow
  193. }
  194. const (
  195. SubNameKeywordChineseSimple = "chs"
  196. SubNameKeywordTraditional = "cht"
  197. )
  198. // Language 语言类型,注意,这里默认还是查找的是中文字幕,只不过下载的时候可能附带了其他的
  199. type Language int
  200. const (
  201. Unknow Language = iota // 未知语言
  202. ChineseSimple // 简体中文
  203. ChineseTraditional // 繁体中文
  204. ChineseSimpleEnglish // 简英双语字幕
  205. ChineseTraditionalEnglish // 繁英双语字幕
  206. English // 英文
  207. Japanese // 日语
  208. ChineseSimpleJapanese // 简日双语字幕
  209. ChineseTraditionalJapanese // 繁日双语字幕
  210. Korean // 韩语
  211. ChineseSimpleKorean // 简韩双语字幕
  212. ChineseTraditionalKorean // 繁韩双语字幕
  213. )
  214. const (
  215. MathLangChnUnknow = "未知语言"
  216. MatchLangChs = "简"
  217. MatchLangCht = "繁"
  218. MatchLangChsEn = "简英"
  219. MatchLangChtEn = "繁英"
  220. MatchLangEn = "英"
  221. MatchLangJp = "日"
  222. MatchLangChsJp = "简日"
  223. MatchLangChtJp = "繁日"
  224. MatchLangKr = "韩"
  225. MatchLangChsKr = "简韩"
  226. MatchLangChtKr = "繁韩"
  227. )
  228. func (l Language) String() string {
  229. switch l {
  230. case ChineseSimple:
  231. return MatchLangChs
  232. case ChineseTraditional:
  233. return MatchLangCht
  234. case ChineseSimpleEnglish:
  235. return MatchLangChsEn
  236. case ChineseTraditionalEnglish:
  237. return MatchLangChtEn
  238. case English:
  239. return MatchLangEn
  240. case Japanese:
  241. return MatchLangJp
  242. case ChineseSimpleJapanese:
  243. return MatchLangChsJp
  244. case ChineseTraditionalJapanese:
  245. return MatchLangChtJp
  246. case Korean:
  247. return MatchLangKr
  248. case ChineseSimpleKorean:
  249. return MatchLangChsKr
  250. case ChineseTraditionalKorean:
  251. return MatchLangChtKr
  252. default:
  253. return MathLangChnUnknow
  254. }
  255. }