Pārlūkot izejas kodu

PegParser: fix Unicode characters above 65535

QString uses two QChars to store one Unicode character if its
code point is above 65535, which shifts the parse results.
Le Tan 7 gadi atpakaļ
vecāks
revīzija
8dbcf139d8
1 mainītis faili ar 137 papildinājumiem un 0 dzēšanām
  1. 137 0
      src/pegparser.cpp

+ 137 - 0
src/pegparser.cpp

@@ -396,6 +396,131 @@ QVector<VElementRegion> PegParser::parseImageRegions(const QSharedPointer<PegPar
     return regs;
 }
 
+// Largest code point that tryFixUnicodeData() leaves untouched; characters
+// above it are replaced.
+#define MAX_CODE_POINT 65535
+
+// Placeholder byte written (twice) in place of each character above
+// MAX_CODE_POINT. Note: 86 is ASCII 'V'.
+#define X_CHAR 86U
+
+// True if the three bytes starting at @x are the UTF-8 byte order mark
+// (EF BB BF). The && chain short-circuits, so a NUL-terminated string shorter
+// than three bytes is never read past its terminator.
+// @x is parenthesized so that pointer expressions such as `p + 1` expand correctly.
+#define HAS_UTF8_BOM(x)         ( ((*(x) & 0xFF) == 0xEF)\
+                                  && ((*((x) + 1) & 0xFF) == 0xBB)\
+                                  && ((*((x) + 2) & 0xFF) == 0xBF) )
+
+// Decode the UTF-8 sequence starting at @p_ch into @p_codePoint.
+// Return the number of bytes consumed (1 to 4), or -1 if @p_ch does not start
+// a valid sequence: the lead byte is not a valid lead byte, or one of the
+// following bytes is not a continuation byte (10xxxxxx). On -1, @p_codePoint
+// is left untouched.
+// Continuation bytes are checked one by one, so a sequence cut short by the
+// NUL terminator is rejected without reading past the terminator.
+static inline int utf8CodePoint(const char *p_ch, int &p_codePoint)
+{
+    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(p_ch);
+    const unsigned char lead = bytes[0];
+
+    int len = 0;
+    int cp = 0;
+    if ((lead & 0x80) == 0) {
+        // 0xxxxxxx: plain ASCII.
+        p_codePoint = lead;
+        return 1;
+    } else if ((lead & 0xE0) == 0xC0) {
+        // 110yyyxx 10xxxxxx -> 00000yyy xxxxxxxx
+        len = 2;
+        cp = lead & 0x1F;
+    } else if ((lead & 0xF0) == 0xE0) {
+        // 1110yyyy 10yyyyxx 10xxxxxx -> yyyyyyyy xxxxxxxx
+        len = 3;
+        cp = lead & 0x0F;
+    } else if ((lead & 0xF8) == 0xF0) {
+        // 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx -> 000zzzzz yyyyyyyy xxxxxxxx
+        len = 4;
+        cp = lead & 0x07;
+    } else {
+        return -1;
+    }
+
+    // Each continuation byte contributes its low six bits.
+    for (int i = 1; i < len; ++i) {
+        const unsigned char cont = bytes[i];
+        if ((cont & 0xC0) != 0x80) {
+            // Also catches '\0', i.e. a sequence truncated by the end of data.
+            return -1;
+        }
+
+        cp = (cp << 6) | (cont & 0x3F);
+    }
+
+    p_codePoint = cp;
+    return len;
+}
+
+// Copy the first @p_num bytes of @p_src to @p_dest, front to back.
+// Nothing is copied when @p_num is zero or negative.
+static inline void copyChars(char *p_dest, const char *p_src, int p_num)
+{
+    char *out = p_dest;
+    const char *in = p_src;
+    for (int remaining = p_num; remaining > 0; --remaining) {
+        *out++ = *in++;
+    }
+}
+
+// @p_data: NUL-terminated UTF-8 data, optionally starting with a BOM.
+// Every character in @p_data whose code point is above MAX_CODE_POINT is
+// replaced by two X_CHAR bytes, i.e. two characters with code points below 65536.
+// All other bytes (including a leading BOM) are copied unchanged.
+// Return null if there is nothing to fix or if utf8CodePoint() rejects a byte
+// sequence in @p_data. Otherwise, return a NUL-terminated fixed copy of the data.
+static QSharedPointer<char> tryFixUnicodeData(const char *p_data)
+{
+    bool needFix = false;
+    int sz = 0;
+
+    const char *ch = p_data;
+    const bool hasBOM = HAS_UTF8_BOM(ch);
+    if (hasBOM) {
+        ch += 3;
+        sz += 3;
+    }
+
+    // First pass: validate the data and calculate the size of the fixed copy.
+    while (*ch != '\0') {
+        int cp = 0;
+        const int nr = utf8CodePoint(ch, cp);
+        if (nr == -1) {
+            return QSharedPointer<char>();
+        }
+
+        if (cp > MAX_CODE_POINT) {
+            needFix = true;
+            // Two one-byte chars will replace it.
+            sz += 2;
+        } else {
+            sz += nr;
+        }
+
+        ch += nr;
+    }
+
+    if (!needFix) {
+        return QSharedPointer<char>();
+    }
+
+    // The buffer comes from new[], so it must be released with delete[];
+    // QSharedPointer's default deleter would call plain delete.
+    QSharedPointer<char> res(new char[sz + 1], [](char *p_buf) { delete[] p_buf; });
+    char *newChar = res.data();
+    int idx = 0;
+    ch = p_data;
+    if (hasBOM) {
+        copyChars(newChar + idx, ch, 3);
+        ch += 3;
+        idx += 3;
+    }
+
+    // Second pass: copy, replacing each oversized char with two one-byte chars.
+    while (*ch != '\0') {
+        int cp = 0;
+        const int nr = utf8CodePoint(ch, cp);
+        // The first pass already accepted every sequence.
+        Q_ASSERT(nr > 0);
+        if (cp > MAX_CODE_POINT) {
+            *(newChar + idx) = X_CHAR;
+            *(newChar + idx + 1) = X_CHAR;
+            idx += 2;
+        } else {
+            copyChars(newChar + idx, ch, nr);
+            idx += nr;
+        }
+
+        ch += nr;
+    }
+
+    Q_ASSERT(idx == sz);
+    *(newChar + sz) = '\0';
+
+    return res;
+}
+
 pmh_element **PegParser::parseMarkdownToElements(const QSharedPointer<PegParseConfig> &p_config)
 {
     if (p_config->m_data.isEmpty()) {
@@ -403,7 +528,19 @@ pmh_element **PegParser::parseMarkdownToElements(const QSharedPointer<PegParseCo
     }
 
     pmh_element **pmhResult = NULL;
+
+    // p_config->m_data is encoded in UTF-8.
+    // QString stores a string of 16-bit QChars. Unicode characters with code values above 65535 are stored using surrogate pairs, i.e., two consecutive QChars.
+    // Hence, QString uses two QChars to store one code point above 65535, with size()
+    // returning 2, whereas pmh_markdown_to_elements() counts it as a single character.
+    // To keep the offsets aligned, we replace each character whose code point is above 65535
+    // with two characters whose code points are below 65536.
     char *data = p_config->m_data.data();
+    QSharedPointer<char> fixedData = tryFixUnicodeData(data);
+    if (fixedData) {
+        data = fixedData.data();
+    }
+
     pmh_markdown_to_elements(data, p_config->m_extensions, &pmhResult);
     return pmhResult;
 }