language.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526
  1. package language
  2. import (
  3. "github.com/abadojack/whatlanggo"
  4. "github.com/allanpk716/ChineseSubFinder/internal/logic/charset"
  5. "github.com/allanpk716/ChineseSubFinder/internal/pkg/log_helper"
  6. "github.com/allanpk716/ChineseSubFinder/internal/types"
  7. "github.com/allanpk716/ChineseSubFinder/internal/types/subparser"
  8. "github.com/axgle/mahonia"
  9. "github.com/go-creed/sat"
  10. nzlov "github.com/nzlov/chardet"
  11. "github.com/saintfish/chardet"
  12. "strings"
  13. )
  14. // LangConverter 语言转换器
  15. func LangConverter(subLang string) types.Language {
  16. /*
  17. xunlei:未知语言、简体&英语、繁体&英语、简体、繁体、英语
  18. */
  19. if strings.Contains(subLang, types.MatchLangDouble) {
  20. // 双语 - 简英
  21. return types.ChineseSimpleEnglish
  22. } else if strings.Contains(subLang, types.MatchLangChs) {
  23. // 优先简体
  24. if strings.Contains(subLang, types.MatchLangEn) {
  25. // 简英
  26. return types.ChineseSimpleEnglish
  27. } else if strings.Contains(subLang, types.MatchLangJp) {
  28. // 简日
  29. return types.ChineseSimpleJapanese
  30. } else if strings.Contains(subLang, types.MatchLangKr) {
  31. // 简韩
  32. return types.ChineseSimpleKorean
  33. }
  34. // 默认简体中文
  35. return types.ChineseSimple
  36. } else if strings.Contains(subLang, types.MatchLangCht) {
  37. // 然后是繁体
  38. if strings.Contains(subLang, types.MatchLangEn) {
  39. // 繁英
  40. return types.ChineseTraditionalEnglish
  41. } else if strings.Contains(subLang, types.MatchLangJp) {
  42. // 繁日
  43. return types.ChineseTraditionalJapanese
  44. } else if strings.Contains(subLang, types.MatchLangKr) {
  45. // 繁韩
  46. return types.ChineseTraditionalKorean
  47. }
  48. // 默认繁体中文
  49. return types.ChineseTraditional
  50. } else if strings.Contains(subLang, types.MatchLangEn) {
  51. // 英文
  52. return types.English
  53. } else if strings.Contains(subLang, types.MatchLangJp) {
  54. // 日文
  55. return types.Japanese
  56. } else if strings.Contains(subLang, types.MatchLangKr) {
  57. // 韩文
  58. return types.Korean
  59. } else {
  60. // 都没有,则标记未知
  61. return types.Unknow
  62. }
  63. }
  64. // HasChineseLang 是否包含中文
  65. func HasChineseLang(lan types.Language) bool {
  66. switch lan {
  67. case types.ChineseSimple,
  68. types.ChineseTraditional,
  69. types.ChineseSimpleEnglish,
  70. types.ChineseTraditionalEnglish,
  71. types.ChineseSimpleJapanese,
  72. types.ChineseTraditionalJapanese,
  73. types.ChineseSimpleKorean,
  74. types.ChineseTraditionalKorean:
  75. return true
  76. default:
  77. return false
  78. }
  79. }
  80. // IsBilingualSubtitle 是否是双语字幕
  81. func IsBilingualSubtitle(lan types.Language) bool {
  82. switch lan {
  83. case types.ChineseSimpleEnglish,
  84. types.ChineseTraditionalEnglish,
  85. types.ChineseSimpleJapanese,
  86. types.ChineseTraditionalJapanese,
  87. types.ChineseSimpleKorean,
  88. types.ChineseTraditionalKorean:
  89. return true
  90. default:
  91. return false
  92. }
  93. }
  94. // Lang2EmbyNameOld 弃用。从语言转换到 Emby 能够识别的字幕命名
  95. func Lang2EmbyNameOld(lan types.Language) string {
  96. switch lan {
  97. case types.Unknow: // 未知语言
  98. return types.Emby_unknow
  99. case types.ChineseSimple: // 简体中文
  100. return types.Emby_chs
  101. case types.ChineseTraditional: // 繁体中文
  102. return types.Emby_cht
  103. case types.ChineseSimpleEnglish: // 简英双语字幕
  104. return types.Emby_chs_en
  105. case types.ChineseTraditionalEnglish: // 繁英双语字幕
  106. return types.Emby_cht_en
  107. case types.English: // 英文
  108. return types.Emby_en
  109. case types.Japanese: // 日语
  110. return types.Emby_jp
  111. case types.ChineseSimpleJapanese: // 简日双语字幕
  112. return types.Emby_chs_jp
  113. case types.ChineseTraditionalJapanese: // 繁日双语字幕
  114. return types.Emby_cht_jp
  115. case types.Korean: // 韩语
  116. return types.Emby_kr
  117. case types.ChineseSimpleKorean: // 简韩双语字幕
  118. return types.Emby_chs_kr
  119. case types.ChineseTraditionalKorean: // 繁韩双语字幕
  120. return types.Emby_cht_kr
  121. default:
  122. return types.Emby_unknow
  123. }
  124. }
  125. // Lang2ChineseString 将 types.Language 转换为中文描述:简、繁、简英
  126. func Lang2ChineseString(lan types.Language) string {
  127. switch lan {
  128. case types.Unknow: // 未知语言
  129. return types.MathLangChnUnknow
  130. case types.ChineseSimple: // 简体中文
  131. return types.MatchLangChs
  132. case types.ChineseTraditional: // 繁体中文
  133. return types.MatchLangCht
  134. case types.ChineseSimpleEnglish: // 简英双语字幕
  135. return types.MatchLangChsEn
  136. case types.ChineseTraditionalEnglish: // 繁英双语字幕
  137. return types.MatchLangChtEn
  138. case types.English: // 英文
  139. return types.MatchLangEn
  140. case types.Japanese: // 日语
  141. return types.MatchLangJp
  142. case types.ChineseSimpleJapanese: // 简日双语字幕
  143. return types.MatchLangChsJp
  144. case types.ChineseTraditionalJapanese: // 繁日双语字幕
  145. return types.MatchLangChtJp
  146. case types.Korean: // 韩语
  147. return types.MatchLangKr
  148. case types.ChineseSimpleKorean: // 简韩双语字幕
  149. return types.MatchLangChsKr
  150. case types.ChineseTraditionalKorean: // 繁韩双语字幕
  151. return types.MatchLangChtKr
  152. default:
  153. return types.MathLangChnUnknow
  154. }
  155. }
  156. // ChineseISOString2Lang 将 中文描述:zh、zho、chi 转换为 types.Language
  157. func ChineseISOString2Lang(chineseStr string) types.Language {
  158. switch chineseStr {
  159. case types.ChineseAbbr_639_1, types.ChineseAbbr_639_2T, types.ChineseAbbr_639_2B:
  160. return types.ChineseSimple
  161. default:
  162. return types.Unknow
  163. }
  164. }
  165. // ISOString2SupportLang 从 639-2/B 的语言缩写字符串转换为内部的 Language 类型,值支持
  166. // https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
  167. func ISOString2SupportLang(isoString string) types.Language {
  168. switch strings.ToLower(isoString) {
  169. case types.Lang_639_2B_Chinese:
  170. return types.ChineseSimple
  171. case types.Lang_639_2B_English:
  172. return types.English
  173. case types.Lang_639_2B_Japan:
  174. return types.Japanese
  175. case types.Lang_639_2B_Korean:
  176. return types.Korean
  177. default:
  178. return types.Unknow
  179. }
  180. }
  181. // IsSupportISOString 是否是受支持的 639-2/B 语言,中、英、日、韩
  182. func IsSupportISOString(isoString string) bool {
  183. switch strings.ToLower(isoString) {
  184. case types.Lang_639_2B_Chinese, types.Lang_639_2B_English, types.Lang_639_2B_Japan, types.Lang_639_2B_Korean:
  185. return true
  186. default:
  187. return false
  188. }
  189. }
  190. // ChineseString2Lang 将 中文描述:简、繁、简英 转换为 types.Language
  191. func ChineseString2Lang(chineseStr string) types.Language {
  192. switch chineseStr {
  193. case types.MathLangChnUnknow: // 未知语言
  194. return types.Unknow
  195. case types.MatchLangChs: // 简体中文
  196. return types.ChineseSimple
  197. case types.MatchLangCht: // 繁体中文
  198. return types.ChineseTraditional
  199. case types.MatchLangChsEn: // 简英双语字幕
  200. return types.ChineseSimpleEnglish
  201. case types.MatchLangChtEn: // 繁英双语字幕
  202. return types.ChineseTraditionalEnglish
  203. case types.MatchLangEn: // 英文
  204. return types.English
  205. case types.MatchLangJp: // 日语
  206. return types.Japanese
  207. case types.MatchLangChsJp: // 简日双语字幕
  208. return types.ChineseSimpleJapanese
  209. case types.MatchLangChtJp: // 繁日双语字幕
  210. return types.ChineseTraditionalJapanese
  211. case types.MatchLangKr: // 韩语
  212. return types.Korean
  213. case types.MatchLangChsKr: // 简韩双语字幕
  214. return types.ChineseSimpleKorean
  215. case types.MatchLangChtKr: // 繁韩双语字幕
  216. return types.ChineseTraditionalKorean
  217. default:
  218. return types.Unknow
  219. }
  220. }
  221. // GetLangOptions 语言识别的 Options Whitelist
  222. func GetLangOptions() whatlanggo.Options {
  223. return whatlanggo.Options{
  224. Whitelist: map[whatlanggo.Lang]bool{
  225. whatlanggo.Cmn: true, // 中文 11
  226. whatlanggo.Eng: true, // 英文 15
  227. whatlanggo.Jpn: true, // 日文 32
  228. whatlanggo.Kor: true, // 韩文 37
  229. },
  230. }
  231. }
  232. // IsWhiteListLang 是否是白名单语言
  233. func IsWhiteListLang(lang whatlanggo.Lang) bool {
  234. switch lang {
  235. // 中文 英文 日文 韩文
  236. case whatlanggo.Cmn, whatlanggo.Eng, whatlanggo.Jpn, whatlanggo.Kor:
  237. return true
  238. default:
  239. return false
  240. }
  241. }
  242. // DetectSubLangAndStatistics 检测语言然后统计
  243. func DetectSubLangAndStatistics(oneDialogue subparser.OneDialogue, langDict map[int]int, usefulDialoguseEx *[]subparser.OneDialogueEx, chLines *[]string, otherLines *[]string) {
  244. var oneDialogueEx subparser.OneDialogueEx
  245. oneDialogueEx.StartTime = oneDialogue.StartTime
  246. oneDialogueEx.EndTime = oneDialogue.EndTime
  247. for _, line := range oneDialogue.Lines {
  248. info := whatlanggo.DetectWithOptions(line, GetLangOptions())
  249. tmpLang := -1
  250. if IsWhiteListLang(info.Lang) == true {
  251. tmpLang = (int)(info.Lang)
  252. }
  253. // 这一种语言的 key 是否存在,不存在则新建,存在再数值 +1
  254. value, ok := langDict[tmpLang]
  255. if ok == true {
  256. // 累加
  257. value++
  258. langDict[tmpLang] = value
  259. } else {
  260. langDict[tmpLang] = 1
  261. }
  262. // 统计中文有多少行
  263. if info.Lang == whatlanggo.Cmn {
  264. *chLines = append(*chLines, line)
  265. } else {
  266. *otherLines = append(*otherLines, line)
  267. }
  268. // 这里可能是一个 dialogue 里面有两句话,而且两句话都是一个类型的语言,所以其实需要的是合并
  269. switch info.Lang {
  270. case whatlanggo.Cmn:
  271. oneDialogueEx.ChLine += line + " "
  272. case whatlanggo.Eng:
  273. oneDialogueEx.EnLine += line + " "
  274. case whatlanggo.Kor:
  275. oneDialogueEx.KrLine += line + " "
  276. case whatlanggo.Jpn:
  277. oneDialogueEx.JpLine += line + " "
  278. }
  279. }
  280. *usefulDialoguseEx = append(*usefulDialoguseEx, oneDialogueEx)
  281. }
  282. // SubLangStatistics2SubLangType 由分析的信息转换为具体是什么字幕的语言类型
  283. func SubLangStatistics2SubLangType(countLineFeed, AllLines float32, langDict map[int]int, chLines []string) types.Language {
  284. const basePer = 0.8
  285. // 是否是双语?
  286. isDouble := false
  287. perLines := countLineFeed / AllLines
  288. // 第二行字幕出现的概率大于 80% 应该稳了吧,不然还能三语?
  289. if perLines > basePer {
  290. isDouble = true
  291. }
  292. // 中文(包含了 chs 以及 cht,这一级是无法区分的,需要额外的简体和繁体区分方法)
  293. countChinese, hasChinese := langDict[int(whatlanggo.Cmn)]
  294. // 英文
  295. countEnglish, hasEnglish := langDict[int(whatlanggo.Eng)]
  296. // 日文
  297. countJapanese, hasJapanese := langDict[int(whatlanggo.Jpn)]
  298. // 韩文
  299. countKorean, hasKorean := langDict[int(whatlanggo.Kor)]
  300. // 0 - No , 1 - Chs, 2 - Cht
  301. isNoOrChsOrCht := 0
  302. isChsCount := 0
  303. if hasChinese {
  304. for _, line := range chLines {
  305. // 判断是简体还是繁体
  306. if chDict.IsChs(line, 0.9) == true {
  307. isChsCount++
  308. }
  309. }
  310. // 简体句子的占比超过 80%
  311. if float32(isChsCount)/float32(len(chLines)) > 0.8 {
  312. isNoOrChsOrCht = 1
  313. } else {
  314. isNoOrChsOrCht = 2
  315. }
  316. }
  317. // 这里有一种情况,就是双语的字幕不是在一个时间轴上的,而是分成两个时间轴的
  318. // 那么之前的 isDouble 判断就失效了,需要补判一次
  319. if isDouble == false {
  320. if hasChinese && hasEnglish {
  321. isDouble = isDoubleLang(countChinese, countEnglish)
  322. }
  323. if hasChinese && hasJapanese {
  324. isDouble = isDoubleLang(countChinese, countJapanese)
  325. }
  326. if hasChinese && hasKorean {
  327. isDouble = isDoubleLang(countChinese, countKorean)
  328. }
  329. }
  330. // 优先判断双语
  331. if isDouble == true {
  332. // 首先得在外面统计就知道是双语
  333. if hasChinese && hasEnglish {
  334. // 简体 英文
  335. return chIsChsOrCht(types.ChineseSimpleEnglish, isNoOrChsOrCht)
  336. } else if hasChinese && hasJapanese {
  337. // 简体 日文
  338. return chIsChsOrCht(types.ChineseSimpleJapanese, isNoOrChsOrCht)
  339. } else if hasChinese && hasKorean {
  340. // 简体 韩文
  341. return chIsChsOrCht(types.ChineseSimpleKorean, isNoOrChsOrCht)
  342. } else if hasChinese {
  343. return chIsChsOrCht(types.ChineseSimple, isNoOrChsOrCht)
  344. } else if hasEnglish {
  345. return types.English
  346. } else if hasJapanese {
  347. return types.Japanese
  348. } else if hasKorean {
  349. return types.Korean
  350. } else {
  351. return types.Unknow
  352. }
  353. } else {
  354. // 如果比例达不到,那么就是单语言,所以最多的那个就是当前的语言
  355. // 这里的字典是有可能出现
  356. if hasChinese {
  357. // 那么起码要占比 80% 对吧
  358. perLines = float32(countChinese) / AllLines
  359. if perLines > basePer {
  360. return chIsChsOrCht(types.ChineseSimple, isNoOrChsOrCht)
  361. }
  362. }
  363. if hasEnglish {
  364. // 那么起码要占比 80% 对吧
  365. perLines = float32(countEnglish) / AllLines
  366. if perLines > basePer {
  367. return types.English
  368. }
  369. }
  370. if hasJapanese {
  371. // 那么起码要占比 80% 对吧
  372. perLines = float32(countJapanese) / AllLines
  373. if perLines > basePer {
  374. return types.Japanese
  375. }
  376. }
  377. if hasKorean {
  378. // 那么起码要占比 80% 对吧
  379. perLines = float32(countKorean) / AllLines
  380. if perLines > basePer {
  381. return types.Korean
  382. }
  383. }
  384. return types.Unknow
  385. }
  386. }
  387. // 跟中文相关的再使用,其他的无需传入
  388. func chIsChsOrCht(language types.Language, isNoOrChsOrCht int) types.Language {
  389. // 输出原来的
  390. if isNoOrChsOrCht == 0 || isNoOrChsOrCht == 1 {
  391. return language
  392. }
  393. switch language {
  394. case types.ChineseSimpleEnglish:
  395. // 简体 英文
  396. return types.ChineseTraditionalEnglish
  397. case types.ChineseSimpleJapanese:
  398. // 简体 日文
  399. return types.ChineseTraditionalJapanese
  400. case types.ChineseSimpleKorean:
  401. // 简体 韩文
  402. return types.ChineseTraditionalKorean
  403. case types.ChineseSimple:
  404. // 简体
  405. return types.ChineseTraditional
  406. default:
  407. return language
  408. }
  409. }
  410. // IsChineseSimpleOrTraditional 暂时弃用,在 SubLangStatistics2SubLangType 检测语言,通过 unicode 做到。 从字幕的文件名称中尝试确认是简体还是繁体,不需要判断双语问题,有额外的解析器完成。只可能出现 ChineseSimple ChineseTraditional Unknow 三种情况
  411. func IsChineseSimpleOrTraditional(inputFileName string, orgLang types.Language) types.Language {
  412. if strings.Contains(inputFileName, types.SubNameKeywordChineseSimple) || strings.Contains(inputFileName, types.MatchLangChs) {
  413. // 简体中文关键词的匹配
  414. return orgLang
  415. } else if strings.Contains(inputFileName, types.SubNameKeywordTraditional) || strings.Contains(inputFileName, types.MatchLangCht) {
  416. // 繁体中文关键词的匹配
  417. if orgLang == types.ChineseSimple {
  418. // 简体 -> 繁体
  419. return types.ChineseTraditional
  420. } else if orgLang == types.ChineseSimpleEnglish {
  421. // 简体英文 -> 繁体英文
  422. return types.ChineseTraditionalEnglish
  423. } else if orgLang == types.ChineseSimpleJapanese {
  424. // 简体日文 -> 繁体日文
  425. return types.ChineseTraditionalJapanese
  426. } else if orgLang == types.ChineseSimpleKorean {
  427. // 简体韩文 -> 繁体韩文
  428. return types.ChineseTraditionalKorean
  429. }
  430. // 进来了都不是,那么就返回原来的语言
  431. return orgLang
  432. } else {
  433. // 都没有匹配上,返回原来识别出来的类型即可
  434. return orgLang
  435. }
  436. }
  437. // ConvertToString 将字符串从原始编码转换到目标编码,需要配合字符串检测编码库使用 chardet.NewTextDetector()
  438. func ConvertToString(src string, srcCode string, tagCode string) string {
  439. defer func() {
  440. if err := recover(); err != nil {
  441. log_helper.GetLogger().Errorln("ConvertToString panic:", err)
  442. }
  443. }()
  444. srcCoder := mahonia.NewDecoder(srcCode)
  445. srcResult := srcCoder.ConvertString(src)
  446. tagCoder := mahonia.NewDecoder(tagCode)
  447. _, cdata, _ := tagCoder.Translate([]byte(srcResult), true)
  448. result := string(cdata)
  449. return result
  450. }
  451. // 感谢: https://blog.csdn.net/gaoluhua/article/details/109128154,解决了编码问题
  452. // ChangeFileCoding2UTF8 自动检测文件的编码,然后转换到 UTF-8
  453. func ChangeFileCoding2UTF8(inBytes []byte) ([]byte, error) {
  454. best, err := detector.DetectBest(inBytes)
  455. utf8String := ""
  456. if err != nil {
  457. return nil, err
  458. }
  459. if best.Confidence < 90 {
  460. detectBest := nzlov.Mostlike(inBytes)
  461. utf8String, err = charset.ToUTF8(charset.Charset(detectBest), string(inBytes))
  462. } else {
  463. utf8String, err = charset.ToUTF8(charset.Charset(best.Charset), string(inBytes))
  464. }
  465. if err != nil {
  466. return nil, err
  467. }
  468. if utf8String == "" {
  469. return inBytes, nil
  470. }
  471. return []byte(utf8String), nil
  472. }
  // isDoubleLang reports whether two line counts are close enough to belong to
  // a bilingual subtitle: the larger count divided by the smaller one must lie
  // in the range [1, 1.4]. A zero count yields false, because the float
  // division produces +Inf or NaN, neither of which is in range.
  func isDoubleLang(count0, count1 int) bool {
  	larger, smaller := count0, count1
  	if count0 < count1 {
  		larger, smaller = count1, count0
  	}

  	ratio := float32(larger) / float32(smaller)
  	return ratio >= 1 && ratio <= 1.4
  }
  490. var (
  491. chDict = sat.DefaultDict()
  492. detector = chardet.NewTextDetector()
  493. )