sub_helper.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
  1. package sub_helper
  2. import (
  3. "github.com/allanpk716/ChineseSubFinder/internal/common"
  4. "github.com/allanpk716/ChineseSubFinder/internal/pkg/archive_helper"
  5. "github.com/allanpk716/ChineseSubFinder/internal/pkg/decode"
  6. "github.com/allanpk716/ChineseSubFinder/internal/pkg/language"
  7. "github.com/allanpk716/ChineseSubFinder/internal/pkg/log_helper"
  8. "github.com/allanpk716/ChineseSubFinder/internal/pkg/my_util"
  9. "github.com/allanpk716/ChineseSubFinder/internal/pkg/sub_parser_hub"
  10. "github.com/allanpk716/ChineseSubFinder/internal/types/subparser"
  11. "github.com/allanpk716/ChineseSubFinder/internal/types/supplier"
  12. "github.com/go-rod/rod/lib/utils"
  13. "io/ioutil"
  14. "os"
  15. "path/filepath"
  16. "regexp"
  17. "strconv"
  18. "strings"
  19. "time"
  20. )
  21. // OrganizeDlSubFiles 需要从汇总来是网站字幕中,解压对应的压缩包中的字幕出来
  22. func OrganizeDlSubFiles(tmpFolderName string, subInfos []supplier.SubInfo) (map[string][]string, error) {
  23. // 缓存列表,整理后的字幕列表
  24. // SxEx - []string 字幕的路径
  25. var siteSubInfoDict = make(map[string][]string)
  26. tmpFolderFullPath, err := my_util.GetTmpFolder(tmpFolderName)
  27. if err != nil {
  28. return nil, err
  29. }
  30. // 把后缀名给改好
  31. ChangeVideoExt2SubExt(subInfos)
  32. // 第三方的解压库,首先不支持 io.Reader 的操作,也就是得缓存到本地硬盘再读取解压
  33. // 且使用 walk 会无法解压 rar,得指定具体的实例,太麻烦了,直接用通用的接口得了,就是得都缓存下来再判断
  34. // 基于以上两点,写了一堆啰嗦的逻辑···
  35. for i := range subInfos {
  36. // 先存下来,保存是时候需要前缀,前缀就是从那个网站下载来的
  37. nowFileSaveFullPath := filepath.Join(tmpFolderFullPath, GetFrontNameAndOrgName(&subInfos[i]))
  38. err = utils.OutputFile(nowFileSaveFullPath, subInfos[i].Data)
  39. if err != nil {
  40. log_helper.GetLogger().Errorln("getFrontNameAndOrgName - OutputFile", subInfos[i].FromWhere, subInfos[i].Name, subInfos[i].TopN, err)
  41. continue
  42. }
  43. nowExt := strings.ToLower(subInfos[i].Ext)
  44. epsKey := my_util.GetEpisodeKeyName(subInfos[i].Season, subInfos[i].Episode)
  45. _, ok := siteSubInfoDict[epsKey]
  46. if ok == false {
  47. // 不存在则实例化
  48. siteSubInfoDict[epsKey] = make([]string, 0)
  49. }
  50. if nowExt != ".zip" && nowExt != ".tar" && nowExt != ".rar" && nowExt != ".7z" {
  51. // 是否是受支持的字幕类型
  52. if sub_parser_hub.IsSubExtWanted(nowExt) == false {
  53. continue
  54. }
  55. // 加入缓存列表
  56. siteSubInfoDict[epsKey] = append(siteSubInfoDict[epsKey], nowFileSaveFullPath)
  57. } else {
  58. // 那么就是需要解压的文件了
  59. // 解压,给一个单独的文件夹
  60. unzipTmpFolder := filepath.Join(tmpFolderFullPath, subInfos[i].FromWhere)
  61. err = os.MkdirAll(unzipTmpFolder, os.ModePerm)
  62. if err != nil {
  63. return nil, err
  64. }
  65. err = archive_helper.UnArchiveFile(nowFileSaveFullPath, unzipTmpFolder)
  66. // 解压完成后,遍历受支持的字幕列表,加入缓存列表
  67. if err != nil {
  68. log_helper.GetLogger().Errorln("archiver.UnArchive", subInfos[i].FromWhere, subInfos[i].Name, subInfos[i].TopN, err)
  69. continue
  70. }
  71. // 搜索这个目录下的所有符合字幕格式的文件
  72. subFileFullPaths, err := SearchMatchedSubFileByDir(unzipTmpFolder)
  73. if err != nil {
  74. log_helper.GetLogger().Errorln("searchMatchedSubFile", subInfos[i].FromWhere, subInfos[i].Name, subInfos[i].TopN, err)
  75. continue
  76. }
  77. // 这里需要给这些下载到的文件进行改名,加是从那个网站来的前缀,后续好查找
  78. for _, fileFullPath := range subFileFullPaths {
  79. newSubName := AddFrontName(subInfos[i], filepath.Base(fileFullPath))
  80. newSubNameFullPath := filepath.Join(tmpFolderFullPath, newSubName)
  81. // 改名
  82. err = os.Rename(fileFullPath, newSubNameFullPath)
  83. if err != nil {
  84. log_helper.GetLogger().Errorln("os.Rename", subInfos[i].FromWhere, subInfos[i].Name, subInfos[i].TopN, err)
  85. continue
  86. }
  87. // 加入缓存列表
  88. siteSubInfoDict[epsKey] = append(siteSubInfoDict[epsKey], newSubNameFullPath)
  89. }
  90. }
  91. }
  92. return siteSubInfoDict, nil
  93. }
  94. // ChangeVideoExt2SubExt 检测 Name,如果是视频的后缀名就改为字幕的后缀名
  95. func ChangeVideoExt2SubExt(subInfos []supplier.SubInfo) {
  96. for x, info := range subInfos {
  97. tmpSubFileName := info.Name
  98. // 如果后缀名是下载字幕目标的后缀名 或者 是压缩包格式的,则跳过
  99. if strings.Contains(tmpSubFileName, info.Ext) == true || archive_helper.IsWantedArchiveExtName(tmpSubFileName) == true {
  100. } else {
  101. subInfos[x].Name = tmpSubFileName + info.Ext
  102. }
  103. }
  104. }
  105. // SelectChineseBestBilingualSubtitle 找到合适的双语中文字幕,简体->繁体,以及 字幕类型的优先级选择
  106. func SelectChineseBestBilingualSubtitle(subs []subparser.FileInfo, subTypePriority int) *subparser.FileInfo {
  107. // 先傻一点实现优先双语的,之前的写法有 bug
  108. for _, info := range subs {
  109. // 找到了中文字幕
  110. if language.HasChineseLang(info.Lang) == true {
  111. // 字幕的优先级 0 - 原样, 1 - srt , 2 - ass/ssa
  112. if subTypePriority == 1 {
  113. // 1 - srt
  114. if strings.ToLower(info.Ext) == common.SubExtSRT {
  115. // 优先双语
  116. if language.IsBilingualSubtitle(info.Lang) == true {
  117. return &info
  118. }
  119. }
  120. } else if subTypePriority == 2 {
  121. // 2 - ass/ssa
  122. if strings.ToLower(info.Ext) == common.SubExtASS || strings.ToLower(info.Ext) == common.SubExtSSA {
  123. // 优先双语
  124. if language.IsBilingualSubtitle(info.Lang) == true {
  125. return &info
  126. }
  127. }
  128. } else {
  129. // 优先双语
  130. if language.IsBilingualSubtitle(info.Lang) == true {
  131. return &info
  132. }
  133. }
  134. }
  135. }
  136. return nil
  137. }
  138. // SelectChineseBestSubtitle 找到合适的中文字幕,简体->繁体,以及 字幕类型的优先级选择
  139. func SelectChineseBestSubtitle(subs []subparser.FileInfo, subTypePriority int) *subparser.FileInfo {
  140. // 先傻一点实现优先双语的,之前的写法有 bug
  141. for _, info := range subs {
  142. // 找到了中文字幕
  143. if language.HasChineseLang(info.Lang) == true {
  144. // 字幕的优先级 0 - 原样, 1 - srt , 2 - ass/ssa
  145. if subTypePriority == 1 {
  146. // 1 - srt
  147. if strings.ToLower(info.Ext) == common.SubExtSRT {
  148. return &info
  149. }
  150. } else if subTypePriority == 2 {
  151. // 2 - ass/ssa
  152. if strings.ToLower(info.Ext) == common.SubExtASS || strings.ToLower(info.Ext) == common.SubExtSSA {
  153. return &info
  154. }
  155. } else {
  156. return &info
  157. }
  158. }
  159. }
  160. return nil
  161. }
  162. // GetFrontNameAndOrgName 返回的名称包含,那个网站下载的,这个网站中排名第几,文件名
  163. func GetFrontNameAndOrgName(info *supplier.SubInfo) string {
  164. infoName := ""
  165. fileName, err := decode.GetVideoInfoFromFileName(info.Name)
  166. if err != nil {
  167. log_helper.GetLogger().Warnln("", err)
  168. infoName = info.Name
  169. } else {
  170. infoName = fileName.Title + "_S" + strconv.Itoa(fileName.Season) + "E" + strconv.Itoa(fileName.Episode) + filepath.Ext(info.Name)
  171. }
  172. info.Name = infoName
  173. return "[" + info.FromWhere + "]_" + strconv.FormatInt(info.TopN, 10) + "_" + infoName
  174. }
  175. // AddFrontName 添加文件的前缀
  176. func AddFrontName(info supplier.SubInfo, orgName string) string {
  177. return "[" + info.FromWhere + "]_" + strconv.FormatInt(info.TopN, 10) + "_" + orgName
  178. }
  179. // SearchMatchedSubFileByDir 搜索符合后缀名的视频文件,排除 Sub_SxE0 这样的文件夹中的文件
  180. func SearchMatchedSubFileByDir(dir string) ([]string, error) {
  181. // 这里有个梗,会出现 __MACOSX 这类文件夹,那么里面会有一样的文件,需要用文件大小排除一下,至少大于 1 kb 吧
  182. var fileFullPathList = make([]string, 0)
  183. pathSep := string(os.PathSeparator)
  184. files, err := ioutil.ReadDir(dir)
  185. if err != nil {
  186. return nil, err
  187. }
  188. for _, curFile := range files {
  189. fullPath := dir + pathSep + curFile.Name()
  190. if curFile.IsDir() {
  191. // 需要排除 Sub_S1E0、Sub_S2E0 这样的整季的字幕文件夹,这里仅仅是缓存,不会被加载的
  192. matched := regOneSeasonSubFolderNameMatch.FindAllStringSubmatch(curFile.Name(), -1)
  193. if len(matched) > 0 {
  194. continue
  195. }
  196. // 内层的错误就无视了
  197. oneList, _ := SearchMatchedSubFileByDir(fullPath)
  198. if oneList != nil {
  199. fileFullPathList = append(fileFullPathList, oneList...)
  200. }
  201. } else {
  202. // 这里就是文件了
  203. if curFile.Size() < 1000 {
  204. continue
  205. }
  206. if sub_parser_hub.IsSubExtWanted(filepath.Ext(curFile.Name())) == true {
  207. fileFullPathList = append(fileFullPathList, fullPath)
  208. }
  209. }
  210. }
  211. return fileFullPathList, nil
  212. }
  213. // SearchMatchedSubFileByOneVideo 搜索这个视频当前目录下匹配的字幕
  214. func SearchMatchedSubFileByOneVideo(oneVideoFullPath string) ([]string, error) {
  215. dir := filepath.Dir(oneVideoFullPath)
  216. fileName := filepath.Base(oneVideoFullPath)
  217. fileName = strings.ToLower(fileName)
  218. fileName = strings.ReplaceAll(fileName, filepath.Ext(fileName), "")
  219. pathSep := string(os.PathSeparator)
  220. files, err := ioutil.ReadDir(dir)
  221. if err != nil {
  222. return nil, err
  223. }
  224. var matchedSubs = make([]string, 0)
  225. for _, curFile := range files {
  226. if curFile.IsDir() {
  227. continue
  228. }
  229. // 这里就是文件了
  230. if curFile.Size() < 1000 {
  231. continue
  232. }
  233. // 判断的时候用小写的,后续重命名的时候用原有的名称
  234. nowFileName := strings.ToLower(curFile.Name())
  235. // 后缀名得对
  236. if sub_parser_hub.IsSubExtWanted(filepath.Ext(nowFileName)) == false {
  237. continue
  238. }
  239. // 字幕文件名应该包含 视频文件名(无后缀)
  240. if strings.Contains(nowFileName, fileName) == false {
  241. continue
  242. }
  243. oldPath := dir + pathSep + curFile.Name()
  244. matchedSubs = append(matchedSubs, oldPath)
  245. }
  246. return matchedSubs, nil
  247. }
  248. // SearchVideoMatchSubFileAndRemoveExtMark 找到找个视频目录下相匹配的字幕,同时去除这些字幕中 .default 或者 .forced 的标记。注意这两个标记不应该同时出现,否则无法正确去除
  249. func SearchVideoMatchSubFileAndRemoveExtMark(oneVideoFullPath string) error {
  250. dir := filepath.Dir(oneVideoFullPath)
  251. fileName := filepath.Base(oneVideoFullPath)
  252. fileName = strings.ToLower(fileName)
  253. fileName = strings.ReplaceAll(fileName, filepath.Ext(fileName), "")
  254. pathSep := string(os.PathSeparator)
  255. files, err := ioutil.ReadDir(dir)
  256. if err != nil {
  257. return err
  258. }
  259. for _, curFile := range files {
  260. if curFile.IsDir() {
  261. continue
  262. } else {
  263. // 这里就是文件了
  264. if curFile.Size() < 1000 {
  265. continue
  266. }
  267. // 判断的时候用小写的,后续重命名的时候用原有的名称
  268. nowFileName := strings.ToLower(curFile.Name())
  269. // 后缀名得对
  270. if sub_parser_hub.IsSubExtWanted(filepath.Ext(nowFileName)) == false {
  271. continue
  272. }
  273. // 字幕文件名应该包含 视频文件名(无后缀)
  274. if strings.Contains(nowFileName, fileName) == false {
  275. continue
  276. }
  277. // 得包含 .default. 找个关键词
  278. if strings.Contains(nowFileName, subparser.Sub_Ext_Mark_Default+".") == true {
  279. oldPath := dir + pathSep + curFile.Name()
  280. newPath := dir + pathSep + strings.ReplaceAll(curFile.Name(), subparser.Sub_Ext_Mark_Default+".", ".")
  281. err = os.Rename(oldPath, newPath)
  282. if err != nil {
  283. return err
  284. }
  285. } else if strings.Contains(nowFileName, subparser.Sub_Ext_Mark_Forced+".") == true {
  286. // 得包含 .forced. 找个关键词
  287. oldPath := dir + pathSep + curFile.Name()
  288. newPath := dir + pathSep + strings.ReplaceAll(curFile.Name(), subparser.Sub_Ext_Mark_Forced+".", ".")
  289. err = os.Rename(oldPath, newPath)
  290. if err != nil {
  291. return err
  292. }
  293. } else {
  294. continue
  295. }
  296. }
  297. }
  298. return nil
  299. }
  300. // DeleteOneSeasonSubCacheFolder 删除一个连续剧中的所有一季字幕的缓存文件夹
  301. func DeleteOneSeasonSubCacheFolder(seriesDir string) error {
  302. files, err := ioutil.ReadDir(seriesDir)
  303. if err != nil {
  304. return err
  305. }
  306. pathSep := string(os.PathSeparator)
  307. for _, curFile := range files {
  308. if curFile.IsDir() == true {
  309. matched := regOneSeasonSubFolderNameMatch.FindAllStringSubmatch(curFile.Name(), -1)
  310. if matched == nil || len(matched) < 1 {
  311. continue
  312. }
  313. fullPath := seriesDir + pathSep + curFile.Name()
  314. err = os.RemoveAll(fullPath)
  315. if err != nil {
  316. return err
  317. }
  318. }
  319. }
  320. return nil
  321. }
  322. /*
  323. 只针对英文字幕进行合并分散的 Dialogues
  324. 会遇到这样的字幕,如下
  325. 2line-The Card Counter (2021) WEBDL-1080p.chinese(inside).ass
  326. 它的对白一句话分了两个 dialogue 去做。这样做后续字幕时间轴校正就会遇到问题,因为只有一半,匹配占比会很低
  327. (每一个 Dialogue 的首字母需要分析,大写和小写的占比是多少,统计一下,正常的,和上述特殊的)
  328. 那么,就需要额外的逻辑去对 DialoguesEx 进行额外的推断
  329. 暂时考虑的方案是,英文对白每一句的开头应该是英文大写字幕,如果是小写字幕,就应该与上语句合并,且每一句的字符长度有大于一定才触发
  330. */
  331. func MergeMultiDialogue4EngSubtitle(inSubParser *subparser.FileInfo) {
  332. merger := NewDialogueMerger()
  333. for _, dialogueEx := range inSubParser.DialoguesEx {
  334. merger.Add(dialogueEx)
  335. }
  336. inSubParser.DialoguesEx = merger.Get()
  337. }
  338. /*
  339. GetVADINfoFromSub
  340. 这里的字幕要求是完整的一个字幕
  341. 1. 抽取字幕的时间片段的时候,暂定,前 15% 和后 15% 要避开,前奏、主题曲、结尾曲
  342. 2. 将整个字幕,抽取连续 5 句对话为一个单元,提取时间片段信息
  343. */
  344. func GetVADINfoFromSub(infoSrc *subparser.FileInfo, FrontAndEndPer float64, SubUnitMaxCount int) ([]SubUnit, error) {
  345. if SubUnitMaxCount < 0 {
  346. SubUnitMaxCount = 0
  347. }
  348. srcSubUnitList := make([]SubUnit, 0)
  349. srcOneSubUnit := NewSubUnit()
  350. srcTimeFormat := infoSrc.GetTimeFormat()
  351. // srcDuration
  352. lastDialogueExTimeEnd, err := time.Parse(srcTimeFormat, infoSrc.DialoguesEx[len(infoSrc.DialoguesEx)-1].EndTime)
  353. if err != nil {
  354. return nil, err
  355. }
  356. srcDuration := my_util.Time2SecendNumber(lastDialogueExTimeEnd)
  357. for i := 0; i < len(infoSrc.DialoguesEx); i++ {
  358. oneDialogueExTimeStart, err := time.Parse(srcTimeFormat, infoSrc.DialoguesEx[i].StartTime)
  359. if err != nil {
  360. return nil, err
  361. }
  362. oneDialogueExTimeEnd, err := time.Parse(srcTimeFormat, infoSrc.DialoguesEx[i].EndTime)
  363. if err != nil {
  364. return nil, err
  365. }
  366. oneStart := my_util.Time2SecendNumber(oneDialogueExTimeStart)
  367. if FrontAndEndPer > 0 {
  368. if srcDuration*FrontAndEndPer > oneStart || srcDuration*(1.0-FrontAndEndPer) < oneStart {
  369. continue
  370. }
  371. }
  372. // 如果当前的这一句话,为空,或者进过正则表达式剔除特殊字符后为空,则跳过
  373. if my_util.ReplaceSpecString(infoSrc.GetDialogueExContent(i), "") == "" {
  374. continue
  375. }
  376. // 低于 5句对白,则添加
  377. if srcOneSubUnit.GetDialogueCount() < SubUnitMaxCount {
  378. srcOneSubUnit.AddAndInsert(oneDialogueExTimeStart, oneDialogueExTimeEnd)
  379. } else {
  380. srcSubUnitList = append(srcSubUnitList, *srcOneSubUnit)
  381. srcOneSubUnit = NewSubUnit()
  382. }
  383. }
  384. if srcOneSubUnit.GetDialogueCount() > 0 {
  385. srcSubUnitList = append(srcSubUnitList, *srcOneSubUnit)
  386. }
  387. return srcSubUnitList, nil
  388. }
  389. var (
  390. regOneSeasonSubFolderNameMatch = regexp.MustCompile(`(?m)^Sub_S\dE0`)
  391. )