Browse Source

解决压缩包内文件名称编码问题,以及字幕文件读取编码问题,可能还会遇到坑,再议

Signed-off-by: 716 <[email protected]>
716 4 years ago
parent
commit
a505c5d967

+ 0 - 1
common/selferr.go

@@ -10,7 +10,6 @@ var(
 	ShooterFileHashIsEmpty = errors.New("filehash is empty")
 
 	ZiMuKuSearchKeyWordStep0DetailPageUrlNotFound = errors.New("zimuku search keyword step0 not found, detail page url")
-	ZiMuKuSearchKeyWordStep1NotFound = errors.New("zimuku search keyword step1 not found")
 	ZiMuKuDownloadUrlStep2NotFound = errors.New("zimuku download url step2 not found")
 	ZiMuKuDownloadUrlStep3NotFound = errors.New("zimuku download url step3 not found")
 	ZiMuKuDownloadUrlStep3AllFailed = errors.New("zimuku download url step3 all failed")

+ 1 - 0
downloader.go

@@ -147,6 +147,7 @@ func (d Downloader) writeSubFile2VideoPath(videoFileFullPath string, finalSubFil
 	if err != nil {
 		return err
 	}
+	d.log.Infoln("OrgSubName:", finalSubFile.Name)
 	d.log.Infoln("SubDownAt:", desSubFullPath)
 	return nil
 }

+ 1 - 0
go.mod

@@ -5,6 +5,7 @@ go 1.15
 require (
 	github.com/PuerkitoBio/goquery v1.6.1
 	github.com/abadojack/whatlanggo v1.0.1
+	github.com/axgle/mahonia v0.0.0-20180208002826-3358181d7394
 	github.com/beevik/etree v1.1.0
 	github.com/gen2brain/go-unarr v0.1.1
 	github.com/go-resty/resty/v2 v2.6.0

+ 2 - 0
go.sum

@@ -28,6 +28,8 @@ github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9Pq
 github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o=
 github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY=
 github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8=
+github.com/axgle/mahonia v0.0.0-20180208002826-3358181d7394 h1:OYA+5W64v3OgClL+IrOD63t4i/RW7RqrAVl9LTZ9UqQ=
+github.com/axgle/mahonia v0.0.0-20180208002826-3358181d7394/go.mod h1:Q8n74mJTIgjX4RBBcHnJ05h//6/k6foqmgE45jTQtxg=
 github.com/beevik/etree v1.1.0 h1:T0xke/WvNtMoCqgzPhkX2r4rjY3GDZFi+FjpRZY2Jbs=
 github.com/beevik/etree v1.1.0/go.mod h1:r8Aw8JqVegEf0w2fDnATrX9VpkMcyFeM0FhwO62wh+A=
 github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=

+ 2 - 0
interface/iSubParser.go

@@ -4,6 +4,8 @@ import "github.com/allanpk716/ChineseSubFinder/common"
 
 type ISubParser interface {
 
+	GetParserName() string
+
 	DetermineFileTypeFromFile(filePath string) (*common.SubParserFileInfo, error)
 
 	DetermineFileTypeFromBytes(inBytes []byte, nowExt string) (*common.SubParserFileInfo, error)

+ 25 - 0
model/language.go

@@ -3,6 +3,8 @@ package model
 import (
 	"github.com/abadojack/whatlanggo"
 	"github.com/allanpk716/ChineseSubFinder/common"
+	"github.com/axgle/mahonia"
+	"github.com/saintfish/chardet"
 	"strings"
 )
 
@@ -225,4 +227,27 @@ func IsChineseSimpleOrTraditional(inputFileName string, orgLang common.Language)
 	}
 }
 
+// ConvertToString decodes src with the srcCode decoder (charset name e.g. from chardet.NewTextDetector()) and passes the result through the tagCode decoder; if either charset name is unknown, src is returned unchanged.
+func ConvertToString(src string, srcCode string, tagCode string) string {
+	srcCoder, tagCoder := mahonia.NewDecoder(srcCode), mahonia.NewDecoder(tagCode)
+	if srcCoder == nil || tagCoder == nil { // NewDecoder yields nil for a charset mahonia does not recognise
+		return src // best effort: invoking a nil decoder would panic
+	}
+	_, cdata, _ := tagCoder.Translate([]byte(srcCoder.ConvertString(src)), true) // byte count and error from Translate are ignored, as before
+	return string(cdata)
+}
 
+// ChangeFileCoding2UTF8 detects the charset of inBytes with chardet and returns the content converted to UTF-8.
+// When detection fails, the detector's error is returned together with nil bytes.
+func ChangeFileCoding2UTF8(inBytes []byte) ([]byte, error) {
+	best, err := chardet.NewTextDetector().DetectBest(inBytes)
+	if err != nil {
+		return nil, err
+	}
+	// Content already reported as UTF-8 is handed back untouched.
+	if best.Charset == "UTF-8" {
+		return inBytes, nil
+	}
+	converted := ConvertToString(string(inBytes), best.Charset, "UTF-8")
+	return []byte(converted), nil
+}

+ 16 - 24
model/unarchiveFile.go

@@ -4,23 +4,24 @@ import (
 	"archive/tar"
 	"archive/zip"
 	"bytes"
+	"golang.org/x/text/encoding/simplifiedchinese"
+	"golang.org/x/text/transform"
+	"io/ioutil"
+
 	"compress/flate"
 	"errors"
-	"fmt"
 	"github.com/gen2brain/go-unarr"
 	"github.com/go-rod/rod/lib/utils"
 	"github.com/mholt/archiver/v3"
-	"github.com/saintfish/chardet"
-	"golang.org/x/text/encoding/simplifiedchinese"
-	"golang.org/x/text/transform"
 	"io"
-	"io/ioutil"
 	"path"
 	"path/filepath"
 	"strings"
 	"unicode/utf8"
 )
 
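+// UnArchiveFile extracts fileFullPath into desRootPath. For every format except 7z, Chinese file names inside the archive are decoded correctly, so whether a subtitle is Simplified or Traditional Chinese can still be read from its name.
+// 7z goes through a separate library that cannot fix the name encoding, so names from .7z archives come out garbled; re-test once that library supports it.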
 func UnArchiveFile(fileFullPath, desRootPath string) error {
 	switch filepath.Ext(strings.ToLower(fileFullPath)) {
 	case ".zip":
@@ -38,13 +39,6 @@ func UnArchiveFile(fileFullPath, desRootPath string) error {
 			}
 			zfh, ok := f.Header.(zip.FileHeader)
 			if ok {
-				isUTF8 := utf8.Valid([]byte(zfh.Name))
-				if isUTF8 != zfh.NonUTF8 {
-					println("the same")
-				} else {
-					println("not the same")
-				}
-
 				err := processOneFile(f, zfh.NonUTF8, desRootPath)
 				if err != nil {
 					return err
@@ -55,7 +49,6 @@ func UnArchiveFile(fileFullPath, desRootPath string) error {
 		if err != nil {
 			return err
 		}
-
 	case ".tar":
 		z := archiver.Tar{
 			MkdirAll:               true,
@@ -102,6 +95,8 @@ func UnArchiveFile(fileFullPath, desRootPath string) error {
 		if err != nil {
 			return err
 		}
+	case ".7z":
+		return unArr7z(fileFullPath, desRootPath)
 	default:
 		return errors.New("not support un archive file ext")
 	}
@@ -110,21 +105,18 @@ func UnArchiveFile(fileFullPath, desRootPath string) error {
 }
 
 func processOneFile(f archiver.File, notUTF8 bool, desRootPath string) error {
-	detector := chardet.NewTextDetector()
 	decodeName := f.Name()
-	result, err := detector.DetectBest([]byte(decodeName))
-	if err != nil {
-		return err
-	}
-	fmt.Printf("Detected charset is %s, language is %s",
-		result.Charset,
-		result.Language)
-
 	if notUTF8 == true {
+
+		//ouBytes, err := ChangeFileCoding2UTF8([]byte(f.Name()))
+		//if err != nil {
+		//	return err
+		//}
 		i := bytes.NewReader([]byte(f.Name()))
 		decoder := transform.NewReader(i, simplifiedchinese.GB18030.NewDecoder())
 		content, _ := ioutil.ReadAll(decoder)
 		decodeName = string(content)
+		//decodeName = string(ouBytes)
 	}
 	var chunk []byte
 	buf := make([]byte, 1024)
@@ -140,14 +132,14 @@ func processOneFile(f archiver.File, notUTF8 bool, desRootPath string) error {
 		//读取到最终的缓冲区中
 		chunk = append(chunk, buf[:n]...)
 	}
-	err = utils.OutputFile(path.Join(desRootPath, decodeName), chunk)
+	err := utils.OutputFile(path.Join(desRootPath, decodeName), chunk)
 	if err != nil {
 		return err
 	}
 	return nil
 }
 
-func UnArr(fileFullPath, desRootPath string) error {
+func unArr7z(fileFullPath, desRootPath string) error {
 	a, err := unarr.NewArchive(fileFullPath)
 	if err != nil {
 		return err

+ 1 - 1
model/unarchiveFile_test.go

@@ -28,7 +28,7 @@ func TestUnArr(t *testing.T) {
 	file := "C:\\Tmp\\123.zip"
 	//file := "C:\\Tmp\\Tmp.7z"
 	//file := "C:\\Tmp\\[zimuku]_0_[zmk.pw]奥斯陆.Oslo.[WEB.1080P]中英文字幕.zip"
-	err := UnArr(file, desRoot)
+	err := unArr7z(file, desRoot)
 	if err != nil {
 		t.Fatal(err)
 	}

+ 9 - 1
sub_parser/ass/ass.go

@@ -16,6 +16,10 @@ func NewParser() *Parser {
 	return &Parser{}
 }
 
+// GetParserName returns the fixed name identifying this parser: "ass".
+// It takes no input and has no side effects.
+func (p Parser) GetParserName() string { return "ass" }
+
 // DetermineFileTypeFromFile 确定字幕文件的类型,是双语字幕或者某一种语言等等信息
 func (p Parser) DetermineFileTypeFromFile(filePath string) (*common.SubParserFileInfo, error) {
 	nowExt := filepath.Ext(filePath)
@@ -26,7 +30,11 @@ func (p Parser) DetermineFileTypeFromFile(filePath string) (*common.SubParserFil
 	if err != nil {
 		return nil ,err
 	}
-	return p.DetermineFileTypeFromBytes(fBytes, nowExt)
+	inBytes, err := model.ChangeFileCoding2UTF8(fBytes)
+	if err != nil {
+		return nil, err
+	}
+	return p.DetermineFileTypeFromBytes(inBytes, nowExt)
 }
 
 // DetermineFileTypeFromBytes 确定字幕文件的类型,是双语字幕或者某一种语言等等信息

+ 3 - 1
sub_parser/ass/ass_test.go

@@ -6,7 +6,9 @@ import (
 
 func TestParser_DetermineFileType(t *testing.T) {
 
-	filePath := "C:\\Tmp\\saw9.ass"
+	//filePath := "C:\\Tmp\\saw9.ass"
+	//filePath := "C:\\tmp\\[zimuku]_0_oslo.2021.1080p.web.h264-naisu.简体&英文.ass"
+	filePath := "C:\\tmp\\oslo.2021.1080p.web.h264-naisu.简体&英文.ass"
 	parser := NewParser()
 	sfi, err := parser.DetermineFileTypeFromFile(filePath)
 	if err != nil {

+ 9 - 2
sub_parser/srt/srt.go

@@ -17,6 +17,10 @@ func NewParser() *Parser {
 	return &Parser{}
 }
 
+// GetParserName returns the fixed name identifying this parser: "srt".
+// It takes no input and has no side effects.
+func (p Parser) GetParserName() string { return "srt" }
+
 // DetermineFileTypeFromFile 确定字幕文件的类型,是双语字幕或者某一种语言等等信息
 func (p Parser) DetermineFileTypeFromFile(filePath string) (*common.SubParserFileInfo, error) {
 	nowExt := filepath.Ext(filePath)
@@ -27,8 +31,11 @@ func (p Parser) DetermineFileTypeFromFile(filePath string) (*common.SubParserFil
 	if err != nil {
 		return nil ,err
 	}
-
-	return p.DetermineFileTypeFromBytes(fBytes, nowExt)
+	inBytes, err := model.ChangeFileCoding2UTF8(fBytes)
+	if err != nil {
+		return nil, err
+	}
+	return p.DetermineFileTypeFromBytes(inBytes, nowExt)
 }
 
 // DetermineFileTypeFromBytes 确定字幕文件的类型,是双语字幕或者某一种语言等等信息

+ 3 - 1
sub_parser/srt/srt_test.go

@@ -5,7 +5,9 @@ import (
 )
 
 func TestParser_DetermineFileType(t *testing.T) {
-	filePath := "C:\\Tmp\\saw9.srt"
+	//filePath := "C:\\Tmp\\saw9.srt"
+	//filePath := "C:\\tmp\\[zimuku]_0_oslo.2021.1080p.web.h264-naisu.简体&英文.srt"
+	filePath := "C:\\tmp\\oslo.2021.1080p.web.h264-naisu.简体&英文.srt"
 	parser := NewParser()
 	sfi, err := parser.DetermineFileTypeFromFile(filePath)
 	if err != nil {

+ 1 - 2
sub_supplier/subSupplierHub.go

@@ -5,7 +5,6 @@ import (
 	"github.com/allanpk716/ChineseSubFinder/interface"
 	"github.com/allanpk716/ChineseSubFinder/model"
 	"github.com/go-rod/rod/lib/utils"
-	"github.com/mholt/archiver/v3"
 	"github.com/sirupsen/logrus"
 	"io/ioutil"
 	"os"
@@ -124,7 +123,7 @@ func (d SubSupplierHub) organizeDlSubFiles(subInfos []common.SupplierSubInfo) ([
 			// 那么就是需要解压的文件了
 			// 解压,给一个单独的文件夹
 			unzipTmpFolder := path.Join(tmpFolderFullPath, subInfo.FromWhere)
-			err = archiver.Unarchive(nowFileSaveFullPath, unzipTmpFolder)
+			err = model.UnArchiveFile(nowFileSaveFullPath, unzipTmpFolder)
 			// 解压完成后,遍历受支持的字幕列表,加入缓存列表
 			if err != nil {
 				d.log.Errorln("archiver.Unarchive", subInfo.FromWhere, subInfo.Name, subInfo.TopN, err)

+ 6 - 67
sub_supplier/subhd/subhd.go

@@ -64,11 +64,7 @@ func (s Supplier) GetSubListFromFile(filePath string) ([]common.SupplierSubInfo,
 	imdbId, err := model.GetImdbId(fileRootDirPath)
 	if err != nil {
 		// 允许的错误,跳过,继续进行文件名的搜索
-		if err == common.CanNotFindIMDBID {
-			s.log.Error(err)
-		} else {
-			return nil, err
-		}
+		s.log.Error(err)
 	}
 
 	var subInfoList []common.SupplierSubInfo
@@ -77,7 +73,8 @@ func (s Supplier) GetSubListFromFile(filePath string) ([]common.SupplierSubInfo,
 		// 先用 imdb id 找
 		subInfoList, err = s.GetSubListFromKeyword(imdbId)
 		if err != nil {
-			return nil, err
+			// 允许的错误,跳过,继续进行文件名的搜索
+			s.log.Error(err)
 		}
 		// 如果有就优先返回
 		if len(subInfoList) >0 {
@@ -169,9 +166,9 @@ func (s Supplier) Step1(detailPageUrl string) ([]HdListItem, error) {
 	}
 	var lists []HdListItem
 
-	const subTableKeyword = ".table-sm tr"
-	const oneSubTrTitleKeyword = "a.text-dark"
-	const oneSubTrDownloadCountKeyword = "td.p-3"
+	const subTableKeyword = ".pt-2"
+	const oneSubTrTitleKeyword = "a.link-dark"
+	const oneSubTrDownloadCountKeyword = "div.px-3"
 	const oneSubLangAndTypeKetword = ".text-secondary"
 
 	doc.Find(subTableKeyword).EachWithBreak(func(i int, tr *goquery.Selection) bool {
@@ -213,64 +210,6 @@ func (s Supplier) Step1(detailPageUrl string) ([]HdListItem, error) {
 
 	return lists, nil
 }
-// Step2 下载字幕,没用了,弃了
-func (s Supplier) Step2(subDownloadPageUrl string) (*HdContent, error) {
-	subDownloadPageUrl = model.AddBaseUrl(common.SubSubHDRootUrl, subDownloadPageUrl)
-	result, err := s.httpGet(subDownloadPageUrl)
-	if err != nil {
-		return nil, err
-	}
-	doc, err := goquery.NewDocumentFromReader(strings.NewReader(result))
-	if err != nil {
-		return nil, err
-	}
-	// 是否有腾讯的防水墙
-	matchList := doc.Find("#TencentCaptcha")
-	if len(matchList.Nodes) < 1 {
-		s.log.Debug("find fang shui qiang")
-	}
-	//matchList = doc.Find("#down")
-	//if len(matchList.Nodes) < 1 {
-	//	println("not found down")
-	//}
-	postData := make(map[string]string)
-	sid, exists := matchList.Attr("sid")
-	if !exists {
-		return nil, common.SubHDStep2SidIsNull
-	}
-	postData["sub_id"] = sid
-	dToken, exists := matchList.Attr("dtoken1")
-	if !exists {
-		return nil, common.SubHDStep2DTokenIsNull
-	}
-	postData["dtoken1"] = dToken
-	url2 := fmt.Sprintf("%s%s", common.SubSubHDRootUrl, "/ajax/down_ajax")
-	result, err = s.httpPost(url2, postData, subDownloadPageUrl)
-	if err != nil {
-		return nil, err
-	}
-	if result == "" || strings.Contains(result, "true") == false {
-		return nil, common.SubHDStep2ResultIsNullOrNotTrue
-	}
-	reg := regexp.MustCompile(`"url":"([^"]+)"`)
-	arr := reg.FindStringSubmatch(result)
-	if len(arr) == 0 {
-		return nil, common.SubHDStep2PostResultGetUrlNotFound
-	}
-	downUrl := arr[1]
-	downUrl = strings.ReplaceAll(downUrl, "\\", "")
-	var filename = filepath.Base(downUrl)
-	var data []byte
-	data, filename, err = model.DownFile(downUrl, s.reqParam)
-	if err != nil {
-		return nil, err
-	}
-	return &HdContent{
-		Filename: filename,
-		Ext:      strings.ToLower(filepath.Ext(filename)),
-		Data:     data,
-	}, nil
-}
 
 // Step2Ex 下载字幕 过防水墙
 func (s Supplier) Step2Ex(browser *rod.Browser, subDownloadPageUrl string) (*HdContent, error)  {

+ 3 - 3
sub_supplier/subhd/subhd_test.go

@@ -1,17 +1,17 @@
 package subhd
 
 import (
-	"github.com/allanpk716/ChineseSubFinder/model"
 	"testing"
 )
 
 func TestSupplier_GetSubListFromFile(t *testing.T) {
 
-	movie1 := "X:\\电影\\Spiral From the Book of Saw (2021)\\Spiral From the Book of Saw (2021) WEBDL-1080p.mkv"
+	movie1 := "X:\\电影\\Oslo (2021)\\Oslo (2021) WEBDL-1080p.mkv"
+	//movie1 := "X:\\电影\\Spiral From the Book of Saw (2021)\\Spiral From the Book of Saw (2021) WEBDL-1080p.mkv"
 	//movie1 := "X:\\电影\\消失爱人 (2016)\\消失爱人 (2016) 720p AAC.rmvb"
 	//movie1 := "X:\\电影\\机动战士Z高达:星之继承者 (2005)\\机动战士Z高达:星之继承者 (2005) 1080p TrueHD.mkv"
 	//movie1 := "X:\\连续剧\\The Bad Batch\\Season 1\\The Bad Batch - S01E01 - Aftermath WEBDL-1080p.mkv"
-	shooter := NewSupplier(common.ReqParam{Topic: 3})
+	shooter := NewSupplier()
 	outList, err := shooter.GetSubListFromFile(movie1)
 	if err != nil {
 		t.Error(err)

+ 6 - 9
sub_supplier/zimuku/zimuku.go

@@ -54,11 +54,7 @@ func (s Supplier) GetSubListFromFile(filePath string) ([]common.SupplierSubInfo,
 	imdbId, err := model.GetImdbId(fileRootDirPath)
 	if err != nil {
 		// 允许的错误,跳过,继续进行文件名的搜索
-		if err == common.CanNotFindIMDBID {
-			s.log.Error(err)
-		} else {
-			return nil, err
-		}
+		s.log.Error(err)
 	}
 
 	var subInfoList []common.SupplierSubInfo
@@ -67,7 +63,8 @@ func (s Supplier) GetSubListFromFile(filePath string) ([]common.SupplierSubInfo,
 		// 先用 imdb id 找
 		subInfoList, err = s.GetSubListFromKeyword(imdbId)
 		if err != nil {
-			return nil, err
+			// 允许的错误,跳过,继续进行文件名的搜索
+			s.log.Error(err)
 		}
 		// 如果有就优先返回
 		if len(subInfoList) >0 {
@@ -146,7 +143,7 @@ func (s Supplier) GetSubListFromKeyword(keyword string) ([]common.SupplierSubInf
 	return outSubInfoList, nil
 }
 
-// Step0 先在查询界面找到字幕对应第一个影片的详情界面
+// Step0 finds, on the search page, the detail page of the first film matching the keyword; callers need to handle the custom error ZiMuKuSearchKeyWordStep0DetailPageUrlNotFound
 func (s Supplier) Step0(keyword string) (string, error) {
 	httpClient := model.NewHttpClient(s.reqParam)
 	// 第一级界面,有多少个字幕
@@ -254,7 +251,7 @@ func (s Supplier) Step1(filmDetailPageUrl string) (SubResult, error) {
 	return subResult, nil
 }
 
-// Step2 第二级界面,单个字幕详情
+// Step2 handles the second-level page, the detail page of a single subtitle; callers need to check for the custom error ZiMuKuDownloadUrlStep2NotFound
 func (s Supplier) Step2(subInfo *SubInfo) error {
 
 	detailUrl := model.AddBaseUrl(common.SubZiMuKuRootUrl, subInfo.DetailUrl)
@@ -279,7 +276,7 @@ func (s Supplier) Step2(subInfo *SubInfo) error {
 	return nil
 }
 
-// Step3 第三级界面,具体字幕下载
+// Step3 handles the third-level page, the actual subtitle download; custom errors to check for: ZiMuKuDownloadUrlStep3NotFound, ZiMuKuDownloadUrlStep3AllFailed
 func (s Supplier) Step3(subDownloadPageUrl string) (string, []byte, error) {
 
 	subDownloadPageUrl = model.AddBaseUrl(common.SubZiMuKuRootUrl, subDownloadPageUrl)