7 gadi atpakaļ · 8dbcf139d8
--- a/src/pegparser.cpp
+++ b/src/pegparser.cpp
@@ -396,6 +396,131 @@ QVector<VElementRegion> PegParser::parseImageRegions(const QSharedPointer<PegPar
 
				     return regs;
			
 
				 }
			
 
				 
			
 
				// Largest code point that is left untouched by tryFixUnicodeData(); characters
				// whose code point is above this value are the ones that get replaced.
				#define MAX_CODE_POINT 65535

				// Byte written (twice) in place of every character above MAX_CODE_POINT.
				// Note: 86 is ASCII 'V', despite the macro's name.
				#define X_CHAR 86U

				// True when the three bytes starting at @x are the UTF-8 byte order mark
				// (EF BB BF). The && chain short-circuits, so for a NUL-terminated buffer no
				// byte past the terminator is read.
				// @x is evaluated more than once: pass an expression without side effects.
				// The argument is parenthesized so that expressions such as `p + 1` work.
				#define HAS_UTF8_BOM(x)         ( ((*(x) & 0xFF) == 0xEF)\
				                                  && ((*((x) + 1) & 0xFF) == 0xBB)\
				                                  && ((*((x) + 2) & 0xFF) == 0xBF) )
			
 
				+
			
 
				// Decode the UTF-8 sequence starting at @p_ch and store its code point in
				// @p_codePoint.
				// @p_ch: points into a NUL-terminated UTF-8 buffer.
				// Return the number of chars consumed (1 to 4), or -1 if @p_ch does not start
				// a decodable sequence: the lead byte is not a valid 1-4 byte lead, or one of
				// the following bytes is not a continuation byte (10xxxxxx). On -1,
				// @p_codePoint is left unchanged.
				// Because every continuation byte is checked before the next one is read, a
				// sequence truncated by the NUL terminator is rejected without reading past
				// the terminator.
				static inline int utf8CodePoint(const char *p_ch, int &p_codePoint)
				{
				    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(p_ch);
				    const unsigned char lead = bytes[0];

				    int len = 0;
				    int cp = 0;
				    if ((lead & 0x80) == 0) {
				        // 0xxxxxxx: plain ASCII.
				        p_codePoint = lead;
				        return 1;
				    } else if ((lead & 0xE0) == 0xC0) {
				        // 110xxxxx 10xxxxxx
				        len = 2;
				        cp = lead & 0x1F;
				    } else if ((lead & 0xF0) == 0xE0) {
				        // 1110xxxx 10xxxxxx 10xxxxxx
				        len = 3;
				        cp = lead & 0x0F;
				    } else if ((lead & 0xF8) == 0xF0) {
				        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				        len = 4;
				        cp = lead & 0x07;
				    } else {
				        // Stray continuation byte or an invalid lead byte.
				        return -1;
				    }

				    // Each continuation byte contributes its low 6 bits.
				    for (int i = 1; i < len; ++i) {
				        const unsigned char cont = bytes[i];
				        if ((cont & 0xC0) != 0x80) {
				            // Also catches the NUL terminator of a truncated sequence.
				            return -1;
				        }

				        cp = (cp << 6) | (cont & 0x3F);
				    }

				    p_codePoint = cp;
				    return len;
				}
			
 
				+
			
 
				// Copy the first @p_num chars of @p_src to @p_dest, front to back.
				// Nothing is copied when @p_num is zero or negative.
				static inline void copyChars(char *p_dest, const char *p_src, int p_num)
				{
				    char *to = p_dest;
				    const char *from = p_src;
				    int remaining = p_num;
				    while (remaining > 0) {
				        *to = *from;
				        ++to;
				        ++from;
				        --remaining;
				    }
				}
			
 
				+
			
 
				+// @p_data: UTF-8 data array.
			
 
				+// If @p_data contain unicode characters with code value above 65535, it will break
			
 
				+// it into two characters with code value below 65536.
			
 
				+// Return null if there is no fix. Otherwise, return a fixed copy of the data.
			
 
				+static QSharedPointer<char> tryFixUnicodeData(const char *p_data)
			
 
				+{
			
 
				+    bool needFix = false;
			
 
				+    int sz = 0;
			
 
				+
			
 
				+    const char *ch = p_data;
			
 
				+    bool hasBOM = false;
			
 
				+    if (HAS_UTF8_BOM(ch)) {
			
 
				+        hasBOM = true;
			
 
				+        ch += 3;
			
 
				+        sz += 3;
			
 
				+    }
			
 
				+
			
 
				+    // Calculate the size of fixed data.
			
 
				+    while (*ch != '\0') {
			
 
				+        int cp;
			
 
				+        int nr = utf8CodePoint(ch, cp);
			
 
				+        if (nr == -1) {
			
 
				+            return NULL;
			
 
				+        }
			
 
				+
			
 
				+        if (cp > MAX_CODE_POINT) {
			
 
				+            needFix = true;
			
 
				+            ch += nr;
			
 
				+            // Use two one-byte chars to replace.
			
 
				+            sz += 2;
			
 
				+        } else {
			
 
				+            ch += nr;
			
 
				+            sz += nr;
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    if (!needFix) {
			
 
				+        return NULL;
			
 
				+    }
			
 
				+
			
 
				+    // Replace those chars with two one-byte chars.
			
 
				+    QSharedPointer<char> res(new char[sz + 1]);
			
 
				+    char *newChar = res.data();
			
 
				+    int idx = 0;
			
 
				+    ch = p_data;
			
 
				+    if (hasBOM) {
			
 
				+        copyChars(newChar + idx, ch, 3);
			
 
				+        ch += 3;
			
 
				+        idx += 3;
			
 
				+    }
			
 
				+
			
 
				+    while (*ch != '\0') {
			
 
				+        int cp;
			
 
				+        int nr = utf8CodePoint(ch, cp);
			
 
				+        Q_ASSERT(nr > 0);
			
 
				+        if (cp > MAX_CODE_POINT) {
			
 
				+            *(newChar + idx) = X_CHAR;
			
 
				+            *(newChar + idx + 1) = X_CHAR;
			
 
				+            ch += nr;
			
 
				+            idx += 2;
			
 
				+        } else {
			
 
				+            copyChars(newChar + idx, ch, nr);
			
 
				+            ch += nr;
			
 
				+            idx += nr;
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    Q_ASSERT(idx == sz);
			
 
				+    *(newChar + sz) = '\0';
			
 
				+
			
 
				+    return res;
			
 
				+}
			
 
				+
			
 
				 pmh_element **PegParser::parseMarkdownToElements(const QSharedPointer<PegParseConfig> &p_config)
			
 
				 {
			
 
				     if (p_config->m_data.isEmpty()) {
			
@@ -403,7 +528,19 @@ pmh_element **PegParser::parseMarkdownToElements(const QSharedPointer<PegParseCo
 
				     }
			
 
				 
			
 
				     pmh_element **pmhResult = NULL;
			
 
				+
			
 
				+    // p_config->m_data is encoded in UTF-8.
			
 
				+    // QString stores a string of 16-bit QChars. Unicode characters with code values above 65535 are stored using surrogate pairs, i.e., two consecutive QChars.
			
 
				+    // Hence, a QString uses two QChars to store one code value if it's above 65535, with size()
			
 
				+    // returning 2. pmh_markdown_to_elements() will treat it as having a size of 1 (as expected).
			
 
				+    // To make it work, we split unicode characters whose code value is above 65535 into two unicode
			
 
				+    // characters whose code value is below 65535.
			
 
				     char *data = p_config->m_data.data();
			
 
				+    QSharedPointer<char> fixedData = tryFixUnicodeData(data);
			
 
				+    if (fixedData) {
			
 
				+        data = fixedData.data();
			
 
				+    }
			
 
				+
			
 
				     pmh_markdown_to_elements(data, p_config->m_extensions, &pmhResult);
			
 
				     return pmhResult;
			
 
				 }