Parser.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800
  1. //
  2. // Parser.cpp
  3. //
  4. // $Id$
  5. //
  6. // Library: JSON
  7. // Package: JSON
  8. // Module: Parser
  9. //
  10. // Copyright (c) 2012, Applied Informatics Software Engineering GmbH.
  11. // and Contributors.
  12. //
  13. // Permission is hereby granted, free of charge, to any person or organization
  14. // obtaining a copy of the software and accompanying documentation covered by
  15. // this license (the "Software") to use, reproduce, display, distribute,
  16. // execute, and transmit the Software, and to prepare derivative works of the
  17. // Software, and to permit third-parties to whom the Software is furnished to
  18. // do so, all subject to the following:
  19. //
  20. // The copyright notices in the Software and this entire statement, including
  21. // the above license grant, this restriction and the following disclaimer,
  22. // must be included in all copies of the Software, in whole or in part, and
  23. // all derivative works of the Software, unless such copies or derivative
  24. // works are solely in the form of machine-executable object code generated by
  25. // a source language processor.
  26. //
  27. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  28. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  29. // FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
  30. // SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
  31. // FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
  32. // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  33. // DEALINGS IN THE SOFTWARE.
  34. //
  35. #include "Poco/JSON/Parser.h"
  36. #include "Poco/JSON/JSONException.h"
  37. #include "Poco/Ascii.h"
  38. #include "Poco/Token.h"
  39. #include "Poco/UTF8Encoding.h"
  40. #undef min
  41. #undef max
  42. #include <limits>
  43. namespace Poco {
  44. namespace JSON {
  45. class SeparatorToken: public Token
  46. {
  47. public:
  48. SeparatorToken()
  49. {
  50. }
  51. virtual ~SeparatorToken()
  52. {
  53. }
  54. Class tokenClass() const
  55. {
  56. return Token::SEPARATOR_TOKEN;
  57. }
  58. bool start(char c, std::istream& istr)
  59. {
  60. if (c == '{'
  61. || c == '}'
  62. || c == ']'
  63. || c == '['
  64. || c == ','
  65. || c == ':')
  66. {
  67. _value = c;
  68. return true;
  69. }
  70. if ( c == '\'' )
  71. {
  72. throw JSONException("Invalid quote found");
  73. }
  74. else return false;
  75. }
  76. void finish(std::istream& istr)
  77. {
  78. }
  79. };
  80. class StringToken: public Token
  81. {
  82. public:
  83. StringToken()
  84. {
  85. }
  86. virtual ~StringToken()
  87. {
  88. }
  89. Class tokenClass() const
  90. {
  91. return Token::STRING_LITERAL_TOKEN;
  92. }
  93. bool start(char c, std::istream& istr)
  94. {
  95. if (c == '"')
  96. {
  97. _value = ""; // We don't need the quote!
  98. return true;
  99. }
  100. else return false;
  101. }
  102. void finish(std::istream& istr)
  103. {
  104. int c = 0;
  105. while ((c = istr.get()) != -1)
  106. {
  107. if (c == 0)
  108. {
  109. throw JSONException("Null byte not allowed");
  110. }
  111. if ( 0 < c && c <= 0x1F )
  112. {
  113. throw JSONException(format("Control character 0x%x not allowed", (unsigned int) c));
  114. }
  115. if (c == '"')
  116. break;
  117. if(0x80 <= c && c <= 0xFF)
  118. {
  119. int count = utf8_check_first(c);
  120. if (!count)
  121. {
  122. throw JSONException(format("Unable to decode byte 0x%x", (unsigned int) c));
  123. }
  124. char buffer[5];
  125. buffer[0] = c;
  126. for(int i = 1; i < count; ++i)
  127. {
  128. buffer[i] = istr.get();
  129. }
  130. if ( !UTF8Encoding::isLegal((unsigned char*) buffer, count) )
  131. {
  132. throw JSONException("No legal UTF8 found");
  133. }
  134. buffer[count] = '\0';
  135. _value += buffer;
  136. continue;
  137. }
  138. if (c == '\\') // Escaped String
  139. {
  140. c = istr.get();
  141. switch(c)
  142. {
  143. case '"' : c = '"'; break;
  144. case '\\' : c = '\\'; break;
  145. case '/' : c = '/'; break;
  146. case 'b' : c = '\b'; break;
  147. case 'f' : c = '\f'; break;
  148. case 'n' : c = '\n'; break;
  149. case 'r' : c = '\r'; break;
  150. case 't' : c = '\t'; break;
  151. case 'u' : // Unicode
  152. {
  153. Poco::Int32 unicode = decodeUnicode(istr);
  154. if ( unicode == 0 )
  155. {
  156. throw JSONException("\\u0000 is not allowed");
  157. }
  158. if ( unicode >= 0xD800 && unicode <= 0xDBFF )
  159. {
  160. c = istr.get();
  161. if ( c != '\\' )
  162. {
  163. throw JSONException("Invalid unicode surrogate pair");
  164. }
  165. c = istr.get();
  166. if ( c != 'u' )
  167. {
  168. throw JSONException("Invalid unicode surrogate pair");
  169. }
  170. Poco::Int32 surrogatePair = decodeUnicode(istr);
  171. if ( 0xDC00 <= surrogatePair && surrogatePair <= 0xDFFF )
  172. {
  173. unicode = 0x10000 + ((unicode & 0x3FF) << 10) + (surrogatePair & 0x3FF);
  174. }
  175. else
  176. {
  177. throw JSONException("Invalid unicode surrogate pair");
  178. }
  179. }
  180. else if ( 0xDC00 <= unicode && unicode <= 0xDFFF )
  181. {
  182. throw JSONException("Invalid unicode");
  183. }
  184. Poco::UTF8Encoding utf8encoding;
  185. int length = utf8encoding.convert(unicode, NULL, 0);
  186. std::vector<unsigned char> convert(length);
  187. utf8encoding.convert(unicode, &convert[0], length);
  188. for(int i = 0; i < length; ++i)
  189. {
  190. _value += (char) convert[i];
  191. }
  192. continue;
  193. }
  194. default:
  195. {
  196. throw JSONException(format("Invalid escape '%c' character used", (char) c));
  197. }
  198. }
  199. }
  200. _value += c;
  201. }
  202. if ( c == -1 )
  203. {
  204. throw JSONException("Unterminated string found");
  205. }
  206. }
  207. Poco::Int32 decodeUnicode(std::istream& istr)
  208. {
  209. Poco::Int32 value = 0;
  210. for(int i = 0; i < 4; i++)
  211. {
  212. value <<= 4;
  213. int nc = istr.peek();
  214. if ( nc == -1 )
  215. {
  216. throw JSONException("Invalid unicode sequence");
  217. }
  218. istr.get(); // No EOF, so read the character
  219. if (nc >= '0' && nc <= '9')
  220. value += nc - '0';
  221. else if (nc >= 'A' && nc <= 'F')
  222. value += 10 + nc - 'A';
  223. else if (nc >= 'a' && nc <= 'f')
  224. value += 10 + nc - 'a';
  225. else
  226. throw JSONException("Invalid unicode sequence. Hexadecimal digit expected");
  227. }
  228. return value;
  229. }
  230. private:
  231. int utf8_check_first(char byte)
  232. {
  233. unsigned char u = (unsigned char) byte;
  234. if(u < 0x80)
  235. return 1;
  236. if (0x80 <= u && u <= 0xBF)
  237. {
  238. /* second, third or fourth byte of a multi-byte
  239. sequence, i.e. a "continuation byte" */
  240. return 0;
  241. }
  242. else if(u == 0xC0 || u == 0xC1)
  243. {
  244. /* overlong encoding of an ASCII byte */
  245. return 0;
  246. }
  247. else if(0xC2 <= u && u <= 0xDF)
  248. {
  249. /* 2-byte sequence */
  250. return 2;
  251. }
  252. else if(0xE0 <= u && u <= 0xEF)
  253. {
  254. /* 3-byte sequence */
  255. return 3;
  256. }
  257. else if(0xF0 <= u && u <= 0xF4)
  258. {
  259. /* 4-byte sequence */
  260. return 4;
  261. }
  262. else
  263. {
  264. /* u >= 0xF5 */
  265. /* Restricted (start of 4-, 5- or 6-byte sequence) or invalid
  266. UTF-8 */
  267. return 0;
  268. }
  269. }
  270. };
  271. class KeywordToken : public Token
  272. {
  273. public:
  274. KeywordToken()
  275. {
  276. }
  277. virtual ~KeywordToken()
  278. {
  279. }
  280. Class tokenClass() const
  281. {
  282. return Token::KEYWORD_TOKEN;
  283. }
  284. bool start(char c, std::istream& istr)
  285. {
  286. if ( Ascii::isAlpha(c) )
  287. {
  288. _value = c;
  289. return true;
  290. }
  291. return false;
  292. }
  293. void finish(std::istream& istr)
  294. {
  295. int c = istr.peek();
  296. while (c != -1 && Ascii::isAlpha(c) )
  297. {
  298. istr.get();
  299. _value += c;
  300. c = istr.peek();
  301. }
  302. }
  303. };
  304. class NumberToken: public Token
  305. {
  306. public:
  307. NumberToken() : _activeClass(INTEGER_LITERAL_TOKEN)
  308. {
  309. }
  310. virtual ~NumberToken()
  311. {
  312. }
  313. Class tokenClass() const
  314. {
  315. return _activeClass;
  316. }
  317. bool start(char c, std::istream& istr)
  318. {
  319. // Reset the active class to integer
  320. _activeClass = INTEGER_LITERAL_TOKEN;
  321. if ( c == -1 )
  322. return false;
  323. if (Ascii::isDigit(c))
  324. {
  325. if ( c == '0' )
  326. {
  327. int nc = istr.peek();
  328. if ( Ascii::isDigit(nc) ) // A digit after a zero is not allowed
  329. {
  330. throw JSONException("Number can't start with a zero");
  331. }
  332. }
  333. _value = c;
  334. return true;
  335. }
  336. if (c == '-')
  337. {
  338. _value = c;
  339. int nc = istr.peek();
  340. if (Ascii::isDigit(nc))
  341. {
  342. if (nc == '0')
  343. {
  344. _value += '0';
  345. istr.get();
  346. nc = istr.peek();
  347. if ( Ascii::isDigit(nc) ) // A digit after -0 is not allowed
  348. {
  349. throw JSONException("Number can't start with a zero");
  350. }
  351. }
  352. return true;
  353. }
  354. }
  355. return false;
  356. }
  357. void finish(std::istream& istr)
  358. {
  359. int c;
  360. while( (c = istr.peek()) != -1)
  361. {
  362. if (Ascii::isDigit(c))
  363. {
  364. _value += c;
  365. istr.get();
  366. }
  367. else
  368. {
  369. switch(c)
  370. {
  371. case '.': // Float
  372. {
  373. if (_activeClass == Token::FLOAT_LITERAL_TOKEN)
  374. {
  375. throw JSONException("Invalid float value");
  376. }
  377. _activeClass = Token::FLOAT_LITERAL_TOKEN;
  378. _value += c;
  379. istr.get();
  380. // After a . we need a digit
  381. c = istr.peek();
  382. if ( ! Ascii::isDigit(c) )
  383. {
  384. throw JSONException("Invalid float value");
  385. }
  386. break;
  387. }
  388. case 'E':
  389. case 'e':
  390. {
  391. if (_activeClass == Token::DOUBLE_LITERAL_TOKEN)
  392. {
  393. throw JSONException("Invalid double value");
  394. }
  395. _activeClass = Token::DOUBLE_LITERAL_TOKEN;
  396. // Add the e or E
  397. _value += c;
  398. istr.get();
  399. // When the next char is - or + then read the next char
  400. c = istr.peek();
  401. if (c == '-' || c == '+')
  402. {
  403. _value += c;
  404. istr.get();
  405. c = istr.peek();
  406. }
  407. if (! Ascii::isDigit(c))
  408. {
  409. throw JSONException("Invalid double value");
  410. }
  411. break;
  412. }
  413. default:
  414. return; // End of number token
  415. }
  416. istr.get(); // If we get here we have a valid character for a number
  417. _value += c;
  418. }
  419. }
  420. }
  421. private:
  422. Class _activeClass;
  423. };
  424. Parser::Parser() : _tokenizer(), _handler(NULL)
  425. {
  426. _tokenizer.addToken(new WhitespaceToken());
  427. _tokenizer.addToken(new InvalidToken());
  428. _tokenizer.addToken(new SeparatorToken());
  429. _tokenizer.addToken(new StringToken());
  430. _tokenizer.addToken(new NumberToken());
  431. _tokenizer.addToken(new KeywordToken());
  432. }
  433. Parser::~Parser()
  434. {
  435. }
  436. const Token* Parser::nextToken()
  437. {
  438. const Token* token = _tokenizer.next();
  439. if (token->is(Token::EOF_TOKEN))
  440. {
  441. throw JSONException("Unexpected EOF found");
  442. }
  443. return token;
  444. }
  445. void Parser::parse(std::istream& in)
  446. {
  447. _tokenizer.attachToStream(in);
  448. const Token* token = nextToken();
  449. if (token->is(Token::SEPARATOR_TOKEN))
  450. {
  451. // This must be a { or a [
  452. if (token->asChar() == '{')
  453. {
  454. readObject();
  455. }
  456. else if (token->asChar() == '[')
  457. {
  458. readArray();
  459. }
  460. else
  461. {
  462. throw JSONException(format("Invalid separator '%c' found. Expecting { or [", token->asChar()));
  463. }
  464. token = _tokenizer.next();
  465. if (! token->is(Token::EOF_TOKEN))
  466. {
  467. throw JSONException(format("EOF expected but found '%s'", token->asString()));
  468. }
  469. }
  470. else
  471. {
  472. throw JSONException(format("Invalid token '%s' found. Expecting { or [", token->asString()));
  473. }
  474. }
  475. void Parser::readObject()
  476. {
  477. if (_handler != NULL)
  478. {
  479. _handler->startObject();
  480. }
  481. if ( readRow(true) ) // First call is special: check for empty object
  482. {
  483. while(readRow());
  484. }
  485. if (_handler != NULL)
  486. {
  487. _handler->endObject();
  488. }
  489. }
  490. bool Parser::readRow(bool firstCall)
  491. {
  492. const Token* token = nextToken();
  493. if (firstCall && token->tokenClass() == Token::SEPARATOR_TOKEN && token->asChar() == '}')
  494. {
  495. return false; // End of object is possible for an empty object
  496. }
  497. if (token->tokenClass() == Token::STRING_LITERAL_TOKEN)
  498. {
  499. std::string propertyName = token->tokenString();
  500. if ( _handler != NULL )
  501. {
  502. _handler->key(propertyName);
  503. }
  504. token = nextToken();
  505. if (token->is(Token::SEPARATOR_TOKEN)
  506. && token->asChar() == ':')
  507. {
  508. readValue(nextToken());
  509. token = nextToken();
  510. if (token->is(Token::SEPARATOR_TOKEN))
  511. {
  512. if (token->asChar() == ',')
  513. {
  514. if (_handler != NULL)
  515. {
  516. _handler->comma();
  517. }
  518. return true; // Read next row
  519. }
  520. else if (token->asChar() == '}')
  521. {
  522. return false; // End of object
  523. }
  524. else
  525. {
  526. throw JSONException(format("Invalid separator '%c' found. Expecting , or }", token->asChar()));
  527. }
  528. }
  529. else
  530. {
  531. throw JSONException(format("Invalid token '%s' found. Expecting , or }", token->asString()));
  532. }
  533. }
  534. else
  535. {
  536. throw JSONException(format("Invalid token '%s' found. Expecting :", token->asString()));
  537. }
  538. }
  539. else
  540. {
  541. throw JSONException(format("Invalid token '%s' found. Expecting key", token->asString()));
  542. }
  543. }
  544. void Parser::readValue(const Token* token)
  545. {
  546. switch(token->tokenClass())
  547. {
  548. default:
  549. case Token::IDENTIFIER_TOKEN:
  550. case Token::OPERATOR_TOKEN:
  551. case Token::CHAR_LITERAL_TOKEN:
  552. break;
  553. case Token::INTEGER_LITERAL_TOKEN:
  554. if (_handler != NULL)
  555. {
  556. #if defined(POCO_HAVE_INT64)
  557. try
  558. {
  559. Int64 value = token->asInteger64();
  560. // if number is 32-bit, then handle as such
  561. if ( value > std::numeric_limits<int>::max()
  562. || value < std::numeric_limits<int>::min() )
  563. {
  564. _handler->value(value);
  565. }
  566. else
  567. {
  568. _handler->value(static_cast<int>(value));
  569. }
  570. }
  571. // try to handle error as unsigned in case of overflow
  572. catch ( const SyntaxException& )
  573. {
  574. UInt64 value = token->asUnsignedInteger64();
  575. // if number is 32-bit, then handle as such
  576. if ( value > std::numeric_limits<unsigned>::max() )
  577. {
  578. _handler->value(value);
  579. }
  580. else
  581. {
  582. _handler->value(static_cast<unsigned>(value));
  583. }
  584. }
  585. #else
  586. try
  587. {
  588. int value = token->asInteger();
  589. _handle->value(value);
  590. }
  591. // try to handle error as unsigned in case of overflow
  592. catch ( const SyntaxException& )
  593. {
  594. unsigned value = token->asUnsignedInteger();
  595. _handle->value(value);
  596. }
  597. #endif
  598. }
  599. break;
  600. case Token::KEYWORD_TOKEN:
  601. {
  602. if (token->tokenString().compare("null") == 0)
  603. {
  604. if (_handler != NULL)
  605. {
  606. _handler->null();
  607. }
  608. }
  609. else if (token->tokenString().compare("true") == 0)
  610. {
  611. if (_handler != NULL)
  612. {
  613. _handler->value(true);
  614. }
  615. }
  616. else if (token->tokenString().compare("false") == 0)
  617. {
  618. if (_handler != NULL)
  619. {
  620. _handler->value(false);
  621. }
  622. }
  623. else
  624. {
  625. throw JSONException(format("Invalid keyword '%s' found", token->asString()));
  626. }
  627. break;
  628. }
  629. case Token::FLOAT_LITERAL_TOKEN:
  630. // Fall through
  631. case Token::DOUBLE_LITERAL_TOKEN:
  632. if (_handler != NULL)
  633. {
  634. _handler->value(token->asFloat());
  635. }
  636. break;
  637. case Token::STRING_LITERAL_TOKEN:
  638. if (_handler != NULL)
  639. {
  640. _handler->value(token->tokenString());
  641. }
  642. break;
  643. case Token::SEPARATOR_TOKEN:
  644. {
  645. if (token->asChar() == '{')
  646. {
  647. readObject();
  648. }
  649. else if (token->asChar() == '[')
  650. {
  651. readArray();
  652. }
  653. break;
  654. }
  655. case Token::INVALID_TOKEN:
  656. throw JSONException(format("Invalid token '%s' found", token->asString()));
  657. }
  658. }
  659. void Parser::readArray()
  660. {
  661. if (_handler != NULL)
  662. {
  663. _handler->startArray();
  664. }
  665. if (readElements(true)) // First call is special: check for empty array
  666. {
  667. while(readElements());
  668. }
  669. if (_handler != NULL)
  670. {
  671. _handler->endArray();
  672. }
  673. }
  674. bool Parser::readElements(bool firstCall)
  675. {
  676. const Token* token = nextToken();
  677. if (firstCall && token->is(Token::SEPARATOR_TOKEN) && token->asChar() == ']')
  678. {
  679. // End of array is possible for an empty array
  680. return false;
  681. }
  682. readValue(token);
  683. token = nextToken();
  684. if (token->is(Token::SEPARATOR_TOKEN))
  685. {
  686. if (token->asChar() == ']')
  687. return false; // End of array
  688. if (token->asChar() == ',')
  689. {
  690. if (_handler != NULL)
  691. {
  692. _handler->comma();
  693. }
  694. return true;
  695. }
  696. throw JSONException(format("Invalid separator '%c' found. Expecting , or ]", token->asChar()));
  697. }
  698. throw JSONException(format("Invalid token '%s' found.", token->asString()));
  699. }
  700. } } // namespace Poco::JSON