language.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. package language
  2. import (
  3. "github.com/abadojack/whatlanggo"
  4. "github.com/allanpk716/ChineseSubFinder/internal/logic/charset"
  5. "github.com/allanpk716/ChineseSubFinder/internal/pkg/log_helper"
  6. "github.com/allanpk716/ChineseSubFinder/internal/types"
  7. "github.com/axgle/mahonia"
  8. "github.com/go-creed/sat"
  9. nzlov "github.com/nzlov/chardet"
  10. "github.com/saintfish/chardet"
  11. "strings"
  12. )
  13. // LangConverter 语言转换器
  14. func LangConverter(subLang string) types.Language {
  15. /*
  16. xunlei:未知语言、简体&英语、繁体&英语、简体、繁体、英语
  17. */
  18. if strings.Contains(subLang, types.MatchLangDouble) {
  19. // 双语 - 简英
  20. return types.ChineseSimpleEnglish
  21. } else if strings.Contains(subLang, types.MatchLangChs) {
  22. // 优先简体
  23. if strings.Contains(subLang, types.MatchLangEn) {
  24. // 简英
  25. return types.ChineseSimpleEnglish
  26. } else if strings.Contains(subLang, types.MatchLangJp) {
  27. // 简日
  28. return types.ChineseSimpleJapanese
  29. } else if strings.Contains(subLang, types.MatchLangKr) {
  30. // 简韩
  31. return types.ChineseSimpleKorean
  32. }
  33. // 默认简体中文
  34. return types.ChineseSimple
  35. } else if strings.Contains(subLang, types.MatchLangCht) {
  36. // 然后是繁体
  37. if strings.Contains(subLang, types.MatchLangEn) {
  38. // 繁英
  39. return types.ChineseTraditionalEnglish
  40. } else if strings.Contains(subLang, types.MatchLangJp) {
  41. // 繁日
  42. return types.ChineseTraditionalJapanese
  43. } else if strings.Contains(subLang, types.MatchLangKr) {
  44. // 繁韩
  45. return types.ChineseTraditionalKorean
  46. }
  47. // 默认繁体中文
  48. return types.ChineseTraditional
  49. } else if strings.Contains(subLang, types.MatchLangEn) {
  50. // 英文
  51. return types.English
  52. } else if strings.Contains(subLang, types.MatchLangJp) {
  53. // 日文
  54. return types.Japanese
  55. } else if strings.Contains(subLang, types.MatchLangKr) {
  56. // 韩文
  57. return types.Korean
  58. } else {
  59. // 都没有,则标记未知
  60. return types.Unknow
  61. }
  62. }
  63. // HasChineseLang 是否包含中文
  64. func HasChineseLang(lan types.Language) bool {
  65. switch lan {
  66. case types.ChineseSimple,
  67. types.ChineseTraditional,
  68. types.ChineseSimpleEnglish,
  69. types.ChineseTraditionalEnglish,
  70. types.ChineseSimpleJapanese,
  71. types.ChineseTraditionalJapanese,
  72. types.ChineseSimpleKorean,
  73. types.ChineseTraditionalKorean:
  74. return true
  75. default:
  76. return false
  77. }
  78. }
  79. // IsBilingualSubtitle 是否是双语字幕
  80. func IsBilingualSubtitle(lan types.Language) bool {
  81. switch lan {
  82. case types.ChineseSimpleEnglish,
  83. types.ChineseTraditionalEnglish,
  84. types.ChineseSimpleJapanese,
  85. types.ChineseTraditionalJapanese,
  86. types.ChineseSimpleKorean,
  87. types.ChineseTraditionalKorean:
  88. return true
  89. default:
  90. return false
  91. }
  92. }
  93. // Lang2EmbyNameOld 弃用。从语言转换到 Emby 能够识别的字幕命名
  94. func Lang2EmbyNameOld(lan types.Language) string {
  95. switch lan {
  96. case types.Unknow: // 未知语言
  97. return types.Emby_unknow
  98. case types.ChineseSimple: // 简体中文
  99. return types.Emby_chs
  100. case types.ChineseTraditional: // 繁体中文
  101. return types.Emby_cht
  102. case types.ChineseSimpleEnglish: // 简英双语字幕
  103. return types.Emby_chs_en
  104. case types.ChineseTraditionalEnglish: // 繁英双语字幕
  105. return types.Emby_cht_en
  106. case types.English: // 英文
  107. return types.Emby_en
  108. case types.Japanese: // 日语
  109. return types.Emby_jp
  110. case types.ChineseSimpleJapanese: // 简日双语字幕
  111. return types.Emby_chs_jp
  112. case types.ChineseTraditionalJapanese: // 繁日双语字幕
  113. return types.Emby_cht_jp
  114. case types.Korean: // 韩语
  115. return types.Emby_kr
  116. case types.ChineseSimpleKorean: // 简韩双语字幕
  117. return types.Emby_chs_kr
  118. case types.ChineseTraditionalKorean: // 繁韩双语字幕
  119. return types.Emby_cht_kr
  120. default:
  121. return types.Emby_unknow
  122. }
  123. }
  124. // Lang2ChineseString 将 types.Language 转换为中文描述:简、繁、简英
  125. func Lang2ChineseString(lan types.Language) string {
  126. switch lan {
  127. case types.Unknow: // 未知语言
  128. return types.MathLangChnUnknow
  129. case types.ChineseSimple: // 简体中文
  130. return types.MatchLangChs
  131. case types.ChineseTraditional: // 繁体中文
  132. return types.MatchLangCht
  133. case types.ChineseSimpleEnglish: // 简英双语字幕
  134. return types.MatchLangChsEn
  135. case types.ChineseTraditionalEnglish: // 繁英双语字幕
  136. return types.MatchLangChtEn
  137. case types.English: // 英文
  138. return types.MatchLangEn
  139. case types.Japanese: // 日语
  140. return types.MatchLangJp
  141. case types.ChineseSimpleJapanese: // 简日双语字幕
  142. return types.MatchLangChsJp
  143. case types.ChineseTraditionalJapanese: // 繁日双语字幕
  144. return types.MatchLangChtJp
  145. case types.Korean: // 韩语
  146. return types.MatchLangKr
  147. case types.ChineseSimpleKorean: // 简韩双语字幕
  148. return types.MatchLangChsKr
  149. case types.ChineseTraditionalKorean: // 繁韩双语字幕
  150. return types.MatchLangChtKr
  151. default:
  152. return types.MathLangChnUnknow
  153. }
  154. }
  155. // GetLangOptions 语言识别的 Options Whitelist
  156. func GetLangOptions() whatlanggo.Options {
  157. return whatlanggo.Options{
  158. Whitelist: map[whatlanggo.Lang]bool{
  159. whatlanggo.Cmn: true, // 中文 11
  160. whatlanggo.Eng: true, // 英文 15
  161. whatlanggo.Jpn: true, // 日文 32
  162. whatlanggo.Kor: true, // 韩文 37
  163. },
  164. }
  165. }
  166. // IsWhiteListLang 是否是白名单语言
  167. func IsWhiteListLang(lang whatlanggo.Lang) bool {
  168. switch lang {
  169. // 中文 英文 日文 韩文
  170. case whatlanggo.Cmn, whatlanggo.Eng, whatlanggo.Jpn, whatlanggo.Kor:
  171. return true
  172. default:
  173. return false
  174. }
  175. }
  176. // DetectSubLangAndStatistics 检测语言然后统计
  177. func DetectSubLangAndStatistics(lines []string, langDict map[int]int, chLines *[]string) {
  178. for _, line := range lines {
  179. info := whatlanggo.DetectWithOptions(line, GetLangOptions())
  180. tmpLang := -1
  181. if IsWhiteListLang(info.Lang) == true {
  182. tmpLang = (int)(info.Lang)
  183. }
  184. // 这一种语言的 key 是否存在,不存在则新建,存在再数值 +1
  185. value, ok := langDict[tmpLang]
  186. if ok == true {
  187. // 累加
  188. value++
  189. langDict[tmpLang] = value
  190. } else {
  191. langDict[tmpLang] = 1
  192. }
  193. // 统计中文有多少行
  194. if info.Lang == whatlanggo.Cmn {
  195. *chLines = append(*chLines, line)
  196. }
  197. }
  198. }
  199. // SubLangStatistics2SubLangType 由分析的信息转换为具体是什么字幕的语言类型
  200. func SubLangStatistics2SubLangType(countLineFeed, AllLines float32, langDict map[int]int, chLines []string) types.Language {
  201. const basePer = 0.8
  202. // 是否是双语?
  203. isDouble := false
  204. perLines := countLineFeed / AllLines
  205. // 第二行字幕出现的概率大于 80% 应该稳了吧,不然还能三语?
  206. if perLines > basePer {
  207. isDouble = true
  208. }
  209. // 中文(包含了 chs 以及 cht,这一级是无法区分的,需要额外的简体和繁体区分方法)
  210. countChinese, hasChinese := langDict[int(whatlanggo.Cmn)]
  211. // 英文
  212. countEnglish, hasEnglish := langDict[int(whatlanggo.Eng)]
  213. // 日文
  214. countJapanese, hasJapanese := langDict[int(whatlanggo.Jpn)]
  215. // 韩文
  216. countKorean, hasKorean := langDict[int(whatlanggo.Kor)]
  217. // 0 - No , 1 - Chs, 2 - Cht
  218. isNoOrChsOrCht := 0
  219. isChsCount := 0
  220. if hasChinese {
  221. for _, line := range chLines {
  222. if chDict.IsChs(line, 0.9) == true {
  223. isChsCount++
  224. }
  225. }
  226. // 简体句子的占比超过 80%
  227. if float32(isChsCount)/float32(len(chLines)) > 0.8 {
  228. isNoOrChsOrCht = 1
  229. } else {
  230. isNoOrChsOrCht = 2
  231. }
  232. }
  233. // 这里有一种情况,就是双语的字幕不是在一个时间轴上的,而是分成两个时间轴的
  234. // 那么之前的 isDouble 判断就失效了,需要补判一次
  235. if isDouble == false {
  236. if hasChinese && hasEnglish {
  237. isDouble = isDoubleLang(countChinese, countEnglish)
  238. }
  239. if hasChinese && hasJapanese {
  240. isDouble = isDoubleLang(countChinese, countJapanese)
  241. }
  242. if hasChinese && hasKorean {
  243. isDouble = isDoubleLang(countChinese, countKorean)
  244. }
  245. }
  246. // 优先判断双语
  247. if isDouble == true {
  248. // 首先得在外面统计就知道是双语
  249. if hasChinese && hasEnglish {
  250. // 简体 英文
  251. return chIsChsOrCht(types.ChineseSimpleEnglish, isNoOrChsOrCht)
  252. } else if hasChinese && hasJapanese {
  253. // 简体 日文
  254. return chIsChsOrCht(types.ChineseSimpleJapanese, isNoOrChsOrCht)
  255. } else if hasChinese && hasKorean {
  256. // 简体 韩文
  257. return chIsChsOrCht(types.ChineseSimpleKorean, isNoOrChsOrCht)
  258. } else if hasChinese {
  259. return chIsChsOrCht(types.ChineseSimple, isNoOrChsOrCht)
  260. } else if hasEnglish {
  261. return types.English
  262. } else if hasJapanese {
  263. return types.Japanese
  264. } else if hasKorean {
  265. return types.Korean
  266. } else {
  267. return types.Unknow
  268. }
  269. } else {
  270. // 如果比例达不到,那么就是单语言,所以最多的那个就是当前的语言
  271. // 这里的字典是有可能出现
  272. if hasChinese {
  273. // 那么起码要占比 80% 对吧
  274. perLines = float32(countChinese) / AllLines
  275. if perLines > basePer {
  276. return chIsChsOrCht(types.ChineseSimple, isNoOrChsOrCht)
  277. }
  278. }
  279. if hasEnglish {
  280. // 那么起码要占比 80% 对吧
  281. perLines = float32(countEnglish) / AllLines
  282. if perLines > basePer {
  283. return types.English
  284. }
  285. }
  286. if hasJapanese {
  287. // 那么起码要占比 80% 对吧
  288. perLines = float32(countJapanese) / AllLines
  289. if perLines > basePer {
  290. return types.Japanese
  291. }
  292. }
  293. if hasKorean {
  294. // 那么起码要占比 80% 对吧
  295. perLines = float32(countKorean) / AllLines
  296. if perLines > basePer {
  297. return types.Korean
  298. }
  299. }
  300. return types.Unknow
  301. }
  302. }
  303. // 跟中文相关的再使用,其他的无需传入
  304. func chIsChsOrCht(language types.Language, isNoOrChsOrCht int) types.Language {
  305. // 输出原来的
  306. if isNoOrChsOrCht == 0 || isNoOrChsOrCht == 1 {
  307. return language
  308. }
  309. switch language {
  310. case types.ChineseSimpleEnglish:
  311. // 简体 英文
  312. return types.ChineseTraditionalEnglish
  313. case types.ChineseSimpleJapanese:
  314. // 简体 日文
  315. return types.ChineseTraditionalJapanese
  316. case types.ChineseSimpleKorean:
  317. // 简体 韩文
  318. return types.ChineseTraditionalKorean
  319. case types.ChineseSimple:
  320. // 简体
  321. return types.ChineseTraditional
  322. default:
  323. return language
  324. }
  325. }
  326. // IsChineseSimpleOrTraditional 暂时弃用,在 SubLangStatistics2SubLangType 检测语言,通过 unicode 做到。 从字幕的文件名称中尝试确认是简体还是繁体,不需要判断双语问题,有额外的解析器完成。只可能出现 ChineseSimple ChineseTraditional Unknow 三种情况
  327. func IsChineseSimpleOrTraditional(inputFileName string, orgLang types.Language) types.Language {
  328. if strings.Contains(inputFileName, types.SubNameKeywordChineseSimple) || strings.Contains(inputFileName, types.MatchLangChs) {
  329. // 简体中文关键词的匹配
  330. return orgLang
  331. } else if strings.Contains(inputFileName, types.SubNameKeywordTraditional) || strings.Contains(inputFileName, types.MatchLangCht) {
  332. // 繁体中文关键词的匹配
  333. if orgLang == types.ChineseSimple {
  334. // 简体 -> 繁体
  335. return types.ChineseTraditional
  336. } else if orgLang == types.ChineseSimpleEnglish {
  337. // 简体英文 -> 繁体英文
  338. return types.ChineseTraditionalEnglish
  339. } else if orgLang == types.ChineseSimpleJapanese {
  340. // 简体日文 -> 繁体日文
  341. return types.ChineseTraditionalJapanese
  342. } else if orgLang == types.ChineseSimpleKorean {
  343. // 简体韩文 -> 繁体韩文
  344. return types.ChineseTraditionalKorean
  345. }
  346. // 进来了都不是,那么就返回原来的语言
  347. return orgLang
  348. } else {
  349. // 都没有匹配上,返回原来识别出来的类型即可
  350. return orgLang
  351. }
  352. }
  353. // ConvertToString 将字符串从原始编码转换到目标编码,需要配合字符串检测编码库使用 chardet.NewTextDetector()
  354. func ConvertToString(src string, srcCode string, tagCode string) string {
  355. defer func() {
  356. if err := recover(); err != nil {
  357. log_helper.GetLogger().Errorln("ConvertToString panic:", err)
  358. }
  359. }()
  360. srcCoder := mahonia.NewDecoder(srcCode)
  361. srcResult := srcCoder.ConvertString(src)
  362. tagCoder := mahonia.NewDecoder(tagCode)
  363. _, cdata, _ := tagCoder.Translate([]byte(srcResult), true)
  364. result := string(cdata)
  365. return result
  366. }
  367. // 感谢: https://blog.csdn.net/gaoluhua/article/details/109128154,解决了编码问题
  368. // ChangeFileCoding2UTF8 自动检测文件的编码,然后转换到 UTF-8
  369. func ChangeFileCoding2UTF8(inBytes []byte) ([]byte, error) {
  370. best, err := detector.DetectBest(inBytes)
  371. utf8String := ""
  372. if err != nil {
  373. return nil, err
  374. }
  375. if best.Confidence < 90 {
  376. detectBest := nzlov.Mostlike(inBytes)
  377. utf8String, err = charset.ToUTF8(charset.Charset(detectBest), string(inBytes))
  378. } else {
  379. utf8String, err = charset.ToUTF8(charset.Charset(best.Charset), string(inBytes))
  380. }
  381. if err != nil {
  382. return nil, err
  383. }
  384. if utf8String == "" {
  385. return inBytes, nil
  386. }
  387. return []byte(utf8String), nil
  388. }
  389. func isDoubleLang(count0, count1 int) bool {
  390. if count0 >= count1 {
  391. f := float32(count0) / float32(count1)
  392. if f >= 1 && f <= 1.4 {
  393. return true
  394. } else {
  395. return false
  396. }
  397. } else {
  398. f := float32(count1) / float32(count0)
  399. if f >= 1 && f <= 1.4 {
  400. return true
  401. } else {
  402. return false
  403. }
  404. }
  405. }
// Package-level helpers shared by the functions in this file.
var (
	// chDict is the default dictionary from the sat package; this file uses
	// its IsChs method to judge whether a line is Simplified Chinese.
	chDict = sat.DefaultDict()
	// detector is the chardet text detector used by ChangeFileCoding2UTF8 to
	// guess the encoding of raw bytes.
	detector = chardet.NewTextDetector()
)