123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399 |
- #include "Encode.h"
- #include <QTextCodec>
- #include <QtDebug>
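/* Class for detecting the encoding of text. Conclusions after reviewing the literature:
 * - If the file is UTF with a BOM, or UNICODE (UTF-16), its first 2-3 bytes carry an
 *   identifying mark, and the file is handled directly with the corresponding encoding.
 * - If there is no such mark, the candidates are UTF8 (no BOM) and ANSI (only GBK is
 *   considered for now).
 * - In that case the content itself has to be analysed: decode everything as UTF8; if UTF8
 *   decoding fails, decode as GBK. If it decodes cleanly as GBK, the file is most probably
 *   GBK encoded.
 */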
- Encode::Encode()
- {
- }
- Encode::~Encode()
- {
- }
- CODE_ID Encode::getCodeByName(QString name)
- {
- CODE_ID id;
- if (name == "unknown")
- {
- id = CODE_ID::UNKOWN;
- }
- else if (name == "UTF16-LE")
- {
- id = CODE_ID::UNICODE_LE;
- }
- else if (name == "UTF16-BE")
- {
- id = CODE_ID::UNICODE_BE;
- }
- else if (name == "UTF8")
- {
- id = CODE_ID::UTF8_NOBOM;
- }
- else if (name == "UTF8-BOM")
- {
- id = CODE_ID::UTF8_BOM;
- }
- else if (name == "GBK")
- {
- id = CODE_ID::GBK;
- }
- else if (name == "EUC-JP")
- {
- id = CODE_ID::EUC_JP;
- }
- else if (name == "Shift-JIS")
- {
- id = CODE_ID::Shift_JIS;
- }
- else if (name == "EUC-KR")
- {
- id = CODE_ID::EUC_KR;
- }
- else if (name == "KOI8-R")
- {
- id = CODE_ID::KOI8_R;
- }
- else if (name == "TSCII")
- {
- id = CODE_ID::TSCII;
- }
- else if (name == "TIS-620")
- {
- id = CODE_ID::TIS_620;
- }
- else
- {
- id = CODE_ID::UNKOWN;
- }
- return id;
- }
- QString Encode::getLineEndById(RC_LINE_FORM id)
- {
- QString ret;
- switch (id)
- {
- case PAD_LINE:
- case UNKNOWN_LINE:
- #ifdef WIN32
- ret = "Windows(CR LF)";
- #else
- ret = "Unix(LF)";
- #endif
- ret = "NULL";
- break;
- case UNIX_LINE:
- ret = "Unix(LF)";
- break;
- case DOS_LINE:
- ret = "Windows(CR LF)";
- break;
- case MAC_LINE:
- ret = "Mac(CR)";
- break;
- default:
- break;
- }
- return ret;
- }
- QString Encode::getCodeNameById(CODE_ID id)
- {
- QString ret;
- switch (id)
- {
- case UNKOWN:
- ret = "unknown";
- break;
- case ANSI:
- ret = "unknown";
- break;
- case UNICODE_LE:
- ret = "UTF16-LE";
- break;
- case UNICODE_BE:
- ret = "UTF16-BE";
- break;
- case UTF8_NOBOM:
- ret = "UTF8";
- break;
- case UTF8_BOM:
- ret = "UTF8-BOM";
- break;
- case GBK:
- ret = "GBK";
- break;
- case EUC_JP:
- ret = "EUC-JP";
- break;
- case Shift_JIS:
- ret = "Shift-JIS";
- break;
- case EUC_KR:
- ret = "EUC-KR";
- break;
- case KOI8_R:
- ret = "KOI8-R";
- break;
- case TSCII:
- ret = "TSCII";
- break;
- case TIS_620:
- ret = "TIS-620";
- break;
- default:
- ret = "unknown";
- break;
- }
- return ret;
- }
- QByteArray Encode::getEncodeStartFlagByte(CODE_ID code)
- {
- QByteArray ret;
- switch (code)
- {
- case UNICODE_LE:
- {
- ret.append((char)0xFF);
- ret.append((char)0xFE);
- }
- break;
- case UNICODE_BE:
- {
- ret.append((char)0xFE);
- ret.append((char)0xFF);
- }
- break;
- case UTF8_BOM:
- {
- ret.append((char)0xEF);
- ret.append((char)0xBB);
- ret.append((char)0xBF);
- }
- break;
- default:
- break;
- }
- return ret;
- }
- CODE_ID Encode::DetectEncode(const uchar* pBuffer, int length, int &skip)
- {
- if (pBuffer[0] == 0xFF && pBuffer[1] == 0xFE)
- {
- skip = 2;
- return CODE_ID::UNICODE_LE; //skip 2
- }
- if (pBuffer[0] == 0xFE && pBuffer[1] == 0xFF)
- {
- skip = 2;
- return CODE_ID::UNICODE_BE; //skip 2
- }
- if (pBuffer[0] == 0xEF && pBuffer[1] == 0xBB && pBuffer[2] == 0xBF)
- {
- skip = 3;
- return CODE_ID::UTF8_BOM; //skip 3 with BOM
- }
- // 不能知道是不是UTF8
- CODE_ID code = CheckUnicodeWithoutBOM(pBuffer, length);
- skip = 0;
- return code; //skip 0
- }
- bool Encode::tranGbkToUNICODE(const char* pText, int length, QString &out)
- {
- QTextCodec::ConverterState state;
-
- QTextCodec *codec = QTextCodec::codecForName("GBK");
- out = codec->toUnicode((const char *)pText, length, &state);
- if (state.invalidChars > 0) {
- return false;
- }
- return true;
- }
- bool Encode::tranUtf8ToUNICODE(const char* pText, int length, QString &out)
- {
- QTextCodec::ConverterState state;
- QTextCodec *codec = QTextCodec::codecForName("UTF-8");
- out = codec->toUnicode((const char *)pText, length, &state);
- if (state.invalidChars > 0) {
- return false;
- }
- return true;
- }
- //与getCodeNameById类似,但是返回的是QT系统支持的编码的字符串名称
- QString Encode::getQtCodecNameById(CODE_ID id)
- {
- QString ret;
- switch (id)
- {
- case UNKOWN:
- case ANSI:
- ret = "unknown";
- break;
- case UNICODE_LE:
- ret = "UTF16-LE";
- break;
- case UNICODE_BE:
- ret = "UTF16-BE";
- break;
- case UTF8_NOBOM://qt没有这种
- case UTF8_BOM:
- ret = "UTF8";
- break;
- case GBK:
- ret = "GBK";
- break;
- case EUC_JP:
- ret = "EUC-JP";
- break;
- case Shift_JIS:
- ret = "Shift-JIS";
- break;
- case EUC_KR:
- ret = "EUC-KR";
- break;
- case KOI8_R:
- ret = "KOI8-R";
- break;
- case TSCII:
- ret = "TSCII";
- break;
- case TIS_620:
- ret = "TIS-620";
- break;
- default:
- ret = "unknown";
- break;
- }
- return ret;
- }
- //将指定编码的字符串转换到unicode
- bool Encode::tranStrToUNICODE(CODE_ID code, const char* pText, int length, QString &out)
- {
- if (length < 0)
- {
- return false;
- }
- QTextCodec::ConverterState state;
- QTextCodec *codec = nullptr;
- QString textCodeName = getQtCodecNameById(code);
- if (textCodeName.isEmpty() || textCodeName == "unknown")
- {
- //对于其它非识别编码,统一转换为utf8。减去让用户选择的麻烦
- //这里其实是有问题的。先这样简单处理
- codec = QTextCodec::codecForName("UTF-8");
- }
- else
- {
- codec = QTextCodec::codecForName(textCodeName.toStdString().c_str());
- }
-
- if (codec == nullptr)
- {
- return false;
- }
- out = codec->toUnicode((const char *)pText, length, &state);
- if (state.invalidChars > 0) {
- return false;
- }
- return true;
- }
- /* 这里其实是穷举字符串的字符编码;ASNI utf8。目前只检测GBK和utf8;其它语种没有穷举
- *GB2312 GBK GB18030 三种差别见https://cloud.tencent.com/developer/article/1343240
- *关于编码的详细说明,见https://blog.csdn.net/libaineu2004/article/details/19245205
- */
- //这里是有限检查utf8的,如果出现gbk,说明一定不是utf8,因为utf8检查到错误码。
- CODE_ID Encode::CheckUnicodeWithoutBOM(const uchar* pText, int length)
- {
- QTextCodec::ConverterState state;
- QTextCodec *codec = QTextCodec::codecForName("UTF-8");
- const QString text = codec->toUnicode((const char *)pText, length, &state);
- if (state.invalidChars > 0) {
- /*不是UTF-8格式的文件,这里优先判断是不是UTF8,再判断是不是GBK;我们先做中文版;如果后续要做
- *国际版,其实不应该只检查GBK,而是应该检查本地ASCI码,包括ascii码*/
- QTextCodec::ConverterState state1;
- QTextCodec *codec1 = QTextCodec::codecForName("GBK");
- codec1->toUnicode((const char *)pText, length, &state1);
- if (state1.invalidChars > 0) {
- return CODE_ID::ANSI;
- }
- else
- {
- return CODE_ID::GBK;
- }
- }
- return CODE_ID::UTF8_NOBOM;
- }
- CODE_ID Encode::CheckUnicodeWithoutBOM(const uchar* pText, int length, QString &outUnicodeText)
- {
- QTextCodec::ConverterState state;
- QTextCodec *codec = QTextCodec::codecForName("UTF-8");
- outUnicodeText = codec->toUnicode((const char *)pText, length, &state);
- if (state.invalidChars > 0) {
- /*不是UTF-8格式的文件,这里优先判断是不是UTF8,再判断是不是GBK;我们先做中文版;如果后续要做
- *国际版,其实不应该只检查GBK,而是因为检查本地ASCI码,包括ascii码*/
- QTextCodec::ConverterState state1;
- QTextCodec *codec1 = QTextCodec::codecForName("GBK");
- QString gbkStr = codec1->toUnicode((const char *)pText, length, &state1);
- if (state1.invalidChars > 0) {
- //如果也不是gbk,姑且按照utf8直接返回
- return CODE_ID::ANSI;
- }
- else
- {
- outUnicodeText = gbkStr;
- return CODE_ID::GBK;
- }
- }
- return CODE_ID::UTF8_NOBOM;
- }
- //检查是否全是ascii字符码
- bool Encode::CheckTextIsAllAscii(const uchar* pText, int length)
- {
- for (int i = 0; i < length; ++i)
- {
- if (*(pText + i) < 0 || *(pText + i) > 0x7F)
- {
- return false;
- }
- }
- return true;
- }
|