Encode.cpp 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399
  1. #include "Encode.h"
  2. #include <QTextCodec>
  3. #include <QtDebug>
  4. /* 检查字符串编码的类。看了大量文献,结论如下:
  5. *如果是UTF BOM格式,或者UNICODE格式,其文件头部前几个字节(2-3)有一定的标识。由此标识直接按对应编码处理。
  6. *如果没有标识,默认就是UTF8(NO BOM) 与 ANSI(现在只考虑GBK)进行对比。
  7. *此时需要做统计分析。对所有行进行UTF8解析,如果按照UTF8解析错位再按照GBK解析。如果解析出GBK那么大概率认为文件是GBK编码的。
  8. */
  9. Encode::Encode()
  10. {
  11. }
  12. Encode::~Encode()
  13. {
  14. }
  15. CODE_ID Encode::getCodeByName(QString name)
  16. {
  17. CODE_ID id;
  18. if (name == "unknown")
  19. {
  20. id = CODE_ID::UNKOWN;
  21. }
  22. else if (name == "UTF16-LE")
  23. {
  24. id = CODE_ID::UNICODE_LE;
  25. }
  26. else if (name == "UTF16-BE")
  27. {
  28. id = CODE_ID::UNICODE_BE;
  29. }
  30. else if (name == "UTF8")
  31. {
  32. id = CODE_ID::UTF8_NOBOM;
  33. }
  34. else if (name == "UTF8-BOM")
  35. {
  36. id = CODE_ID::UTF8_BOM;
  37. }
  38. else if (name == "GBK")
  39. {
  40. id = CODE_ID::GBK;
  41. }
  42. else if (name == "EUC-JP")
  43. {
  44. id = CODE_ID::EUC_JP;
  45. }
  46. else if (name == "Shift-JIS")
  47. {
  48. id = CODE_ID::Shift_JIS;
  49. }
  50. else if (name == "EUC-KR")
  51. {
  52. id = CODE_ID::EUC_KR;
  53. }
  54. else if (name == "KOI8-R")
  55. {
  56. id = CODE_ID::KOI8_R;
  57. }
  58. else if (name == "TSCII")
  59. {
  60. id = CODE_ID::TSCII;
  61. }
  62. else if (name == "TIS-620")
  63. {
  64. id = CODE_ID::TIS_620;
  65. }
  66. else
  67. {
  68. id = CODE_ID::UNKOWN;
  69. }
  70. return id;
  71. }
  72. QString Encode::getLineEndById(RC_LINE_FORM id)
  73. {
  74. QString ret;
  75. switch (id)
  76. {
  77. case PAD_LINE:
  78. case UNKNOWN_LINE:
  79. #ifdef WIN32
  80. ret = "Windows(CR LF)";
  81. #else
  82. ret = "Unix(LF)";
  83. #endif
  84. ret = "NULL";
  85. break;
  86. case UNIX_LINE:
  87. ret = "Unix(LF)";
  88. break;
  89. case DOS_LINE:
  90. ret = "Windows(CR LF)";
  91. break;
  92. case MAC_LINE:
  93. ret = "Mac(CR)";
  94. break;
  95. default:
  96. break;
  97. }
  98. return ret;
  99. }
  100. QString Encode::getCodeNameById(CODE_ID id)
  101. {
  102. QString ret;
  103. switch (id)
  104. {
  105. case UNKOWN:
  106. ret = "unknown";
  107. break;
  108. case ANSI:
  109. ret = "unknown";
  110. break;
  111. case UNICODE_LE:
  112. ret = "UTF16-LE";
  113. break;
  114. case UNICODE_BE:
  115. ret = "UTF16-BE";
  116. break;
  117. case UTF8_NOBOM:
  118. ret = "UTF8";
  119. break;
  120. case UTF8_BOM:
  121. ret = "UTF8-BOM";
  122. break;
  123. case GBK:
  124. ret = "GBK";
  125. break;
  126. case EUC_JP:
  127. ret = "EUC-JP";
  128. break;
  129. case Shift_JIS:
  130. ret = "Shift-JIS";
  131. break;
  132. case EUC_KR:
  133. ret = "EUC-KR";
  134. break;
  135. case KOI8_R:
  136. ret = "KOI8-R";
  137. break;
  138. case TSCII:
  139. ret = "TSCII";
  140. break;
  141. case TIS_620:
  142. ret = "TIS-620";
  143. break;
  144. default:
  145. ret = "unknown";
  146. break;
  147. }
  148. return ret;
  149. }
  150. QByteArray Encode::getEncodeStartFlagByte(CODE_ID code)
  151. {
  152. QByteArray ret;
  153. switch (code)
  154. {
  155. case UNICODE_LE:
  156. {
  157. ret.append((char)0xFF);
  158. ret.append((char)0xFE);
  159. }
  160. break;
  161. case UNICODE_BE:
  162. {
  163. ret.append((char)0xFE);
  164. ret.append((char)0xFF);
  165. }
  166. break;
  167. case UTF8_BOM:
  168. {
  169. ret.append((char)0xEF);
  170. ret.append((char)0xBB);
  171. ret.append((char)0xBF);
  172. }
  173. break;
  174. default:
  175. break;
  176. }
  177. return ret;
  178. }
  179. CODE_ID Encode::DetectEncode(const uchar* pBuffer, int length, int &skip)
  180. {
  181. if (pBuffer[0] == 0xFF && pBuffer[1] == 0xFE)
  182. {
  183. skip = 2;
  184. return CODE_ID::UNICODE_LE; //skip 2
  185. }
  186. if (pBuffer[0] == 0xFE && pBuffer[1] == 0xFF)
  187. {
  188. skip = 2;
  189. return CODE_ID::UNICODE_BE; //skip 2
  190. }
  191. if (pBuffer[0] == 0xEF && pBuffer[1] == 0xBB && pBuffer[2] == 0xBF)
  192. {
  193. skip = 3;
  194. return CODE_ID::UTF8_BOM; //skip 3 with BOM
  195. }
  196. // 不能知道是不是UTF8
  197. CODE_ID code = CheckUnicodeWithoutBOM(pBuffer, length);
  198. skip = 0;
  199. return code; //skip 0
  200. }
  201. bool Encode::tranGbkToUNICODE(const char* pText, int length, QString &out)
  202. {
  203. QTextCodec::ConverterState state;
  204. QTextCodec *codec = QTextCodec::codecForName("GBK");
  205. out = codec->toUnicode((const char *)pText, length, &state);
  206. if (state.invalidChars > 0) {
  207. return false;
  208. }
  209. return true;
  210. }
  211. bool Encode::tranUtf8ToUNICODE(const char* pText, int length, QString &out)
  212. {
  213. QTextCodec::ConverterState state;
  214. QTextCodec *codec = QTextCodec::codecForName("UTF-8");
  215. out = codec->toUnicode((const char *)pText, length, &state);
  216. if (state.invalidChars > 0) {
  217. return false;
  218. }
  219. return true;
  220. }
  221. //与getCodeNameById类似,但是返回的是QT系统支持的编码的字符串名称
  222. QString Encode::getQtCodecNameById(CODE_ID id)
  223. {
  224. QString ret;
  225. switch (id)
  226. {
  227. case UNKOWN:
  228. case ANSI:
  229. ret = "unknown";
  230. break;
  231. case UNICODE_LE:
  232. ret = "UTF16-LE";
  233. break;
  234. case UNICODE_BE:
  235. ret = "UTF16-BE";
  236. break;
  237. case UTF8_NOBOM://qt没有这种
  238. case UTF8_BOM:
  239. ret = "UTF8";
  240. break;
  241. case GBK:
  242. ret = "GBK";
  243. break;
  244. case EUC_JP:
  245. ret = "EUC-JP";
  246. break;
  247. case Shift_JIS:
  248. ret = "Shift-JIS";
  249. break;
  250. case EUC_KR:
  251. ret = "EUC-KR";
  252. break;
  253. case KOI8_R:
  254. ret = "KOI8-R";
  255. break;
  256. case TSCII:
  257. ret = "TSCII";
  258. break;
  259. case TIS_620:
  260. ret = "TIS-620";
  261. break;
  262. default:
  263. ret = "unknown";
  264. break;
  265. }
  266. return ret;
  267. }
  268. //将指定编码的字符串转换到unicode
  269. bool Encode::tranStrToUNICODE(CODE_ID code, const char* pText, int length, QString &out)
  270. {
  271. if (length < 0)
  272. {
  273. return false;
  274. }
  275. QTextCodec::ConverterState state;
  276. QTextCodec *codec = nullptr;
  277. QString textCodeName = getQtCodecNameById(code);
  278. if (textCodeName.isEmpty() || textCodeName == "unknown")
  279. {
  280. //对于其它非识别编码,统一转换为utf8。减去让用户选择的麻烦
  281. //这里其实是有问题的。先这样简单处理
  282. codec = QTextCodec::codecForName("UTF-8");
  283. }
  284. else
  285. {
  286. codec = QTextCodec::codecForName(textCodeName.toStdString().c_str());
  287. }
  288. if (codec == nullptr)
  289. {
  290. return false;
  291. }
  292. out = codec->toUnicode((const char *)pText, length, &state);
  293. if (state.invalidChars > 0) {
  294. return false;
  295. }
  296. return true;
  297. }
  298. /* 这里其实是穷举字符串的字符编码;ASNI utf8。目前只检测GBK和utf8;其它语种没有穷举
  299. *GB2312 GBK GB18030 三种差别见https://cloud.tencent.com/developer/article/1343240
  300. *关于编码的详细说明,见https://blog.csdn.net/libaineu2004/article/details/19245205
  301. */
  302. //这里是有限检查utf8的,如果出现gbk,说明一定不是utf8,因为utf8检查到错误码。
  303. CODE_ID Encode::CheckUnicodeWithoutBOM(const uchar* pText, int length)
  304. {
  305. QTextCodec::ConverterState state;
  306. QTextCodec *codec = QTextCodec::codecForName("UTF-8");
  307. const QString text = codec->toUnicode((const char *)pText, length, &state);
  308. if (state.invalidChars > 0) {
  309. /*不是UTF-8格式的文件,这里优先判断是不是UTF8,再判断是不是GBK;我们先做中文版;如果后续要做
  310. *国际版,其实不应该只检查GBK,而是应该检查本地ASCI码,包括ascii码*/
  311. QTextCodec::ConverterState state1;
  312. QTextCodec *codec1 = QTextCodec::codecForName("GBK");
  313. codec1->toUnicode((const char *)pText, length, &state1);
  314. if (state1.invalidChars > 0) {
  315. return CODE_ID::ANSI;
  316. }
  317. else
  318. {
  319. return CODE_ID::GBK;
  320. }
  321. }
  322. return CODE_ID::UTF8_NOBOM;
  323. }
  324. CODE_ID Encode::CheckUnicodeWithoutBOM(const uchar* pText, int length, QString &outUnicodeText)
  325. {
  326. QTextCodec::ConverterState state;
  327. QTextCodec *codec = QTextCodec::codecForName("UTF-8");
  328. outUnicodeText = codec->toUnicode((const char *)pText, length, &state);
  329. if (state.invalidChars > 0) {
  330. /*不是UTF-8格式的文件,这里优先判断是不是UTF8,再判断是不是GBK;我们先做中文版;如果后续要做
  331. *国际版,其实不应该只检查GBK,而是因为检查本地ASCI码,包括ascii码*/
  332. QTextCodec::ConverterState state1;
  333. QTextCodec *codec1 = QTextCodec::codecForName("GBK");
  334. QString gbkStr = codec1->toUnicode((const char *)pText, length, &state1);
  335. if (state1.invalidChars > 0) {
  336. //如果也不是gbk,姑且按照utf8直接返回
  337. return CODE_ID::ANSI;
  338. }
  339. else
  340. {
  341. outUnicodeText = gbkStr;
  342. return CODE_ID::GBK;
  343. }
  344. }
  345. return CODE_ID::UTF8_NOBOM;
  346. }
  347. //检查是否全是ascii字符码
  348. bool Encode::CheckTextIsAllAscii(const uchar* pText, int length)
  349. {
  350. for (int i = 0; i < length; ++i)
  351. {
  352. if (*(pText + i) < 0 || *(pText + i) > 0x7F)
  353. {
  354. return false;
  355. }
  356. }
  357. return true;
  358. }