|  | @@ -396,6 +396,131 @@ QVector<VElementRegion> PegParser::parseImageRegions(const QSharedPointer<PegPar
 | 
	
		
			
				|  |  |      return regs;
 | 
	
		
			
				|  |  |  }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				// Largest code point left untouched by tryFixUnicodeData(); anything above
				// this value (i.e. outside the 16-bit range) gets replaced.
				#define MAX_CODE_POINT 65535

				// Byte value written in place of each replaced character. 86 is ASCII 'V'.
				// NOTE(review): the name suggests 'X' (88) - confirm which one is intended.
				#define X_CHAR 86U

				// True if the three bytes starting at @x are the UTF-8 byte order mark
				// (EF BB BF). The && chain short-circuits, so a string shorter than three
				// bytes is never read past its terminator.
				// @x is parenthesized so that pointer expressions such as `p + 1` work; it
				// is still evaluated up to three times, so it must not have side effects.
				#define HAS_UTF8_BOM(x)         ( ((*(x) & 0xFF) == 0xEF)\
				                                  && ((*((x)+1) & 0xFF) == 0xBB)\
				                                  && ((*((x)+2) & 0xFF) == 0xBF) )
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				// Decode the UTF-8 sequence that starts at @p_ch.
				// @p_ch: pointer into a null-terminated UTF-8 byte array.
				// @p_codePoint: receives the decoded code point; left untouched on failure.
				// Return the number of bytes consumed (1-4), or -1 if the lead byte is not a
				// valid sequence start or a continuation byte is not of the form 10xxxxxx.
				static inline int utf8CodePoint(const char *p_ch, int &p_codePoint)
				{
				    const unsigned char *bytes = reinterpret_cast<const unsigned char *>(p_ch);
				    const unsigned char lead = bytes[0];

				    // Sequence length and payload bits of the lead byte.
				    int len = 0;
				    int cp = 0;
				    if ((lead & 0x80) == 0) {
				        // 0xxxxxxx: plain ASCII (this includes the terminator itself).
				        p_codePoint = lead;
				        return 1;
				    } else if ((lead & 0xE0) == 0xC0) {
				        // 110xxxxx 10xxxxxx
				        len = 2;
				        cp = lead & 0x1F;
				    } else if ((lead & 0xF0) == 0xE0) {
				        // 1110xxxx 10xxxxxx 10xxxxxx
				        len = 3;
				        cp = lead & 0x0F;
				    } else if ((lead & 0xF8) == 0xF0) {
				        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
				        len = 4;
				        cp = lead & 0x07;
				    } else {
				        // Stray continuation byte or an invalid lead byte (0xF8-0xFF).
				        return -1;
				    }

				    // Bytes are inspected strictly in order and decoding stops at the first
				    // one that is not a continuation byte. The terminator is not a
				    // continuation byte, so a truncated sequence is rejected without ever
				    // reading past the end of the string.
				    for (int i = 1; i < len; ++i) {
				        const unsigned char cont = bytes[i];
				        if ((cont & 0xC0) != 0x80) {
				            return -1;
				        }

				        cp = (cp << 6) | (cont & 0x3F);
				    }

				    p_codePoint = cp;
				    return len;
				}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				// Copy the first @p_num chars of @p_src into @p_dest, front to back.
				// Nothing is copied when @p_num is zero or negative.
				static inline void copyChars(char *p_dest, const char *p_src, int p_num)
				{
				    int remaining = p_num;
				    while (remaining > 0) {
				        *p_dest++ = *p_src++;
				        --remaining;
				    }
				}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +// @p_data: UTF-8 data array.
 | 
	
		
			
				|  |  | +// If @p_data contain unicode characters with code value above 65535, it will break
 | 
	
		
			
				|  |  | +// it into two characters with code value below 65536.
 | 
	
		
			
				|  |  | +// Return null if there is no fix. Otherwise, return a fixed copy of the data.
 | 
	
		
			
				|  |  | +static QSharedPointer<char> tryFixUnicodeData(const char *p_data)
 | 
	
		
			
				|  |  | +{
 | 
	
		
			
				|  |  | +    bool needFix = false;
 | 
	
		
			
				|  |  | +    int sz = 0;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    const char *ch = p_data;
 | 
	
		
			
				|  |  | +    bool hasBOM = false;
 | 
	
		
			
				|  |  | +    if (HAS_UTF8_BOM(ch)) {
 | 
	
		
			
				|  |  | +        hasBOM = true;
 | 
	
		
			
				|  |  | +        ch += 3;
 | 
	
		
			
				|  |  | +        sz += 3;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    // Calculate the size of fixed data.
 | 
	
		
			
				|  |  | +    while (*ch != '\0') {
 | 
	
		
			
				|  |  | +        int cp;
 | 
	
		
			
				|  |  | +        int nr = utf8CodePoint(ch, cp);
 | 
	
		
			
				|  |  | +        if (nr == -1) {
 | 
	
		
			
				|  |  | +            return NULL;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +        if (cp > MAX_CODE_POINT) {
 | 
	
		
			
				|  |  | +            needFix = true;
 | 
	
		
			
				|  |  | +            ch += nr;
 | 
	
		
			
				|  |  | +            // Use two one-byte chars to replace.
 | 
	
		
			
				|  |  | +            sz += 2;
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +            ch += nr;
 | 
	
		
			
				|  |  | +            sz += nr;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    if (!needFix) {
 | 
	
		
			
				|  |  | +        return NULL;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    // Replace those chars with two one-byte chars.
 | 
	
		
			
				|  |  | +    QSharedPointer<char> res(new char[sz + 1]);
 | 
	
		
			
				|  |  | +    char *newChar = res.data();
 | 
	
		
			
				|  |  | +    int idx = 0;
 | 
	
		
			
				|  |  | +    ch = p_data;
 | 
	
		
			
				|  |  | +    if (hasBOM) {
 | 
	
		
			
				|  |  | +        copyChars(newChar + idx, ch, 3);
 | 
	
		
			
				|  |  | +        ch += 3;
 | 
	
		
			
				|  |  | +        idx += 3;
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    while (*ch != '\0') {
 | 
	
		
			
				|  |  | +        int cp;
 | 
	
		
			
				|  |  | +        int nr = utf8CodePoint(ch, cp);
 | 
	
		
			
				|  |  | +        Q_ASSERT(nr > 0);
 | 
	
		
			
				|  |  | +        if (cp > MAX_CODE_POINT) {
 | 
	
		
			
				|  |  | +            *(newChar + idx) = X_CHAR;
 | 
	
		
			
				|  |  | +            *(newChar + idx + 1) = X_CHAR;
 | 
	
		
			
				|  |  | +            ch += nr;
 | 
	
		
			
				|  |  | +            idx += 2;
 | 
	
		
			
				|  |  | +        } else {
 | 
	
		
			
				|  |  | +            copyChars(newChar + idx, ch, nr);
 | 
	
		
			
				|  |  | +            ch += nr;
 | 
	
		
			
				|  |  | +            idx += nr;
 | 
	
		
			
				|  |  | +        }
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    Q_ASSERT(idx == sz);
 | 
	
		
			
				|  |  | +    *(newChar + sz) = '\0';
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    return res;
 | 
	
		
			
				|  |  | +}
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |  pmh_element **PegParser::parseMarkdownToElements(const QSharedPointer<PegParseConfig> &p_config)
 | 
	
		
			
				|  |  |  {
 | 
	
		
			
				|  |  |      if (p_config->m_data.isEmpty()) {
 | 
	
	
		
			
				|  | @@ -403,7 +528,19 @@ pmh_element **PegParser::parseMarkdownToElements(const QSharedPointer<PegParseCo
 | 
	
		
			
				|  |  |      }
 | 
	
		
			
				|  |  |  
 | 
	
		
			
				|  |  |      pmh_element **pmhResult = NULL;
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  | +    // p_config->m_data is encoded in UTF-8.
 | 
	
		
			
				|  |  | +    // QString stores a string of 16-bit QChars. Unicode characters with code values above 65535 are stored using surrogate pairs, i.e., two consecutive QChars.
 | 
	
		
			
				|  |  | +    // Hence, a QString uses two QChars to store one code value if it's above 65535, with size()
 | 
	
		
			
				|  |  | +    // returning 2. pmh_markdown_to_elements() will treat it as having a size of 1 (as expected).
 | 
	
		
			
				|  |  | +    // To make it work, we split unicode characters whose code value is above 65535 into two unicode
 | 
	
		
			
				|  |  | +    // characters whose code value is below 65535.
 | 
	
		
			
				|  |  |      char *data = p_config->m_data.data();
 | 
	
		
			
				|  |  | +    QSharedPointer<char> fixedData = tryFixUnicodeData(data);
 | 
	
		
			
				|  |  | +    if (fixedData) {
 | 
	
		
			
				|  |  | +        data = fixedData.data();
 | 
	
		
			
				|  |  | +    }
 | 
	
		
			
				|  |  | +
 | 
	
		
			
				|  |  |      pmh_markdown_to_elements(data, p_config->m_extensions, &pmhResult);
 | 
	
		
			
				|  |  |      return pmhResult;
 | 
	
		
			
				|  |  |  }
 |