UrlDecoder.cs 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627
  1. // Licensed to the .NET Foundation under one or more agreements.
  2. // The .NET Foundation licenses this file to you under the MIT license.
  3. // See the LICENSE file in the project root for more information.
  4. using System;
  5. using System.Runtime.CompilerServices;
  6. namespace Microsoft.AspNetCore.Internal
  7. {
  8. internal class UrlDecoder
  9. {
  10. /// <summary>
  11. /// Unescape a URL path
  12. /// </summary>
  13. /// <param name="source">The byte span represents a UTF8 encoding url path.</param>
  14. /// <param name="destination">The byte span where unescaped url path is copied to.</param>
  15. /// <param name="isFormEncoding">Whether we are doing form encoding or not.</param>
  16. /// <returns>The length of the byte sequence of the unescaped url path.</returns>
  17. public static int DecodeRequestLine(ReadOnlySpan<byte> source, Span<byte> destination, bool isFormEncoding)
  18. {
  19. if (destination.Length < source.Length)
  20. {
  21. throw new ArgumentException(
  22. "Length of the destination byte span is less than the source.",
  23. nameof(destination));
  24. }
  25. // This requires the destination span to be larger or equal to source span
  26. source.CopyTo(destination);
  27. return DecodeInPlace(destination.Slice(0, source.Length), isFormEncoding);
  28. }
  29. /// <summary>
  30. /// Unescape a URL path in place.
  31. /// </summary>
  32. /// <param name="buffer">The byte span represents a UTF8 encoding url path.</param>
  33. /// <param name="isFormEncoding">Whether we are doing form encoding or not.</param>
  34. /// <returns>The number of the bytes representing the result.</returns>
  35. /// <remarks>
  36. /// The unescape is done in place, which means after decoding the result is the subset of
  37. /// the input span.
  38. /// </remarks>
  39. public static int DecodeInPlace(Span<byte> buffer, bool isFormEncoding)
  40. {
  41. // the slot to read the input
  42. var sourceIndex = 0;
  43. // the slot to write the unescaped byte
  44. var destinationIndex = 0;
  45. while (true)
  46. {
  47. if (sourceIndex == buffer.Length)
  48. {
  49. break;
  50. }
  51. if (buffer[sourceIndex] == '+' && isFormEncoding)
  52. {
  53. // Set it to ' ' when we are doing form encoding.
  54. buffer[sourceIndex] = 0x20;
  55. }
  56. else if (buffer[sourceIndex] == '%')
  57. {
  58. var decodeIndex = sourceIndex;
  59. // If decoding process succeeds, the writer iterator will be moved
  60. // to the next write-ready location. On the other hand if the scanned
  61. // percent-encodings cannot be interpreted as sequence of UTF-8 octets,
  62. // these bytes should be copied to output as is.
  63. // The decodeReader iterator is always moved to the first byte not yet
  64. // be scanned after the process. A failed decoding means the chars
  65. // between the reader and decodeReader can be copied to output untouched.
  66. if (!DecodeCore(ref decodeIndex, ref destinationIndex, buffer, isFormEncoding))
  67. {
  68. Copy(sourceIndex, decodeIndex, ref destinationIndex, buffer);
  69. }
  70. sourceIndex = decodeIndex;
  71. }
  72. else
  73. {
  74. buffer[destinationIndex++] = buffer[sourceIndex++];
  75. }
  76. }
  77. return destinationIndex;
  78. }
  79. /// <summary>
  80. /// Unescape the percent-encodings
  81. /// </summary>
  82. /// <param name="sourceIndex">The iterator point to the first % char</param>
  83. /// <param name="destinationIndex">The place to write to</param>
  84. /// <param name="buffer">The byte array</param>
  85. /// <param name="isFormEncoding">Whether we are doing form encodoing</param>
  86. private static bool DecodeCore(ref int sourceIndex, ref int destinationIndex, Span<byte> buffer, bool isFormEncoding)
  87. {
  88. // preserves the original head. if the percent-encodings cannot be interpreted as sequence of UTF-8 octets,
  89. // bytes from this till the last scanned one will be copied to the memory pointed by writer.
  90. var byte1 = UnescapePercentEncoding(ref sourceIndex, buffer, isFormEncoding);
  91. if (byte1 == -1)
  92. {
  93. return false;
  94. }
  95. if (byte1 == 0)
  96. {
  97. throw new InvalidOperationException("The path contains null characters.");
  98. }
  99. if (byte1 <= 0x7F)
  100. {
  101. // first byte < U+007f, it is a single byte ASCII
  102. buffer[destinationIndex++] = (byte)byte1;
  103. return true;
  104. }
  105. int byte2 = 0, byte3 = 0, byte4 = 0;
  106. // anticipate more bytes
  107. var currentDecodeBits = 0;
  108. var byteCount = 1;
  109. var expectValueMin = 0;
  110. if ((byte1 & 0xE0) == 0xC0)
  111. {
  112. // 110x xxxx, expect one more byte
  113. currentDecodeBits = byte1 & 0x1F;
  114. byteCount = 2;
  115. expectValueMin = 0x80;
  116. }
  117. else if ((byte1 & 0xF0) == 0xE0)
  118. {
  119. // 1110 xxxx, expect two more bytes
  120. currentDecodeBits = byte1 & 0x0F;
  121. byteCount = 3;
  122. expectValueMin = 0x800;
  123. }
  124. else if ((byte1 & 0xF8) == 0xF0)
  125. {
  126. // 1111 0xxx, expect three more bytes
  127. currentDecodeBits = byte1 & 0x07;
  128. byteCount = 4;
  129. expectValueMin = 0x10000;
  130. }
  131. else
  132. {
  133. // invalid first byte
  134. return false;
  135. }
  136. var remainingBytes = byteCount - 1;
  137. while (remainingBytes > 0)
  138. {
  139. // read following three chars
  140. if (sourceIndex == buffer.Length)
  141. {
  142. return false;
  143. }
  144. var nextSourceIndex = sourceIndex;
  145. var nextByte = UnescapePercentEncoding(ref nextSourceIndex, buffer, isFormEncoding);
  146. if (nextByte == -1)
  147. {
  148. return false;
  149. }
  150. if ((nextByte & 0xC0) != 0x80)
  151. {
  152. // the follow up byte is not in form of 10xx xxxx
  153. return false;
  154. }
  155. currentDecodeBits = (currentDecodeBits << 6) | (nextByte & 0x3F);
  156. remainingBytes--;
  157. if (remainingBytes == 1 && currentDecodeBits >= 0x360 && currentDecodeBits <= 0x37F)
  158. {
  159. // this is going to end up in the range of 0xD800-0xDFFF UTF-16 surrogates that
  160. // are not allowed in UTF-8;
  161. return false;
  162. }
  163. if (remainingBytes == 2 && currentDecodeBits >= 0x110)
  164. {
  165. // this is going to be out of the upper Unicode bound 0x10FFFF.
  166. return false;
  167. }
  168. sourceIndex = nextSourceIndex;
  169. if (byteCount - remainingBytes == 2)
  170. {
  171. byte2 = nextByte;
  172. }
  173. else if (byteCount - remainingBytes == 3)
  174. {
  175. byte3 = nextByte;
  176. }
  177. else if (byteCount - remainingBytes == 4)
  178. {
  179. byte4 = nextByte;
  180. }
  181. }
  182. if (currentDecodeBits < expectValueMin)
  183. {
  184. // overlong encoding (e.g. using 2 bytes to encode something that only needed 1).
  185. return false;
  186. }
  187. // all bytes are verified, write to the output
  188. // TODO: measure later to determine if the performance of following logic can be improved
  189. // the idea is to combine the bytes into short/int and write to span directly to avoid
  190. // range check cost
  191. if (byteCount > 0)
  192. {
  193. buffer[destinationIndex++] = (byte)byte1;
  194. }
  195. if (byteCount > 1)
  196. {
  197. buffer[destinationIndex++] = (byte)byte2;
  198. }
  199. if (byteCount > 2)
  200. {
  201. buffer[destinationIndex++] = (byte)byte3;
  202. }
  203. if (byteCount > 3)
  204. {
  205. buffer[destinationIndex++] = (byte)byte4;
  206. }
  207. return true;
  208. }
  209. private static void Copy<T>(int begin, int end, ref int writer, Span<T> buffer)
  210. {
  211. while (begin != end)
  212. {
  213. buffer[writer++] = buffer[begin++];
  214. }
  215. }
  216. /// <summary>
  217. /// Read the percent-encoding and try unescape it.
  218. ///
  219. /// The operation first peek at the character the <paramref name="scan"/>
  220. /// iterator points at. If it is % the <paramref name="scan"/> is then
  221. /// moved on to scan the following to characters. If the two following
  222. /// characters are hexadecimal literals they will be unescaped and the
  223. /// value will be returned.
  224. ///
  225. /// If the first character is not % the <paramref name="scan"/> iterator
  226. /// will be removed beyond the location of % and -1 will be returned.
  227. ///
  228. /// If the following two characters can't be successfully unescaped the
  229. /// <paramref name="scan"/> iterator will be move behind the % and -1
  230. /// will be returned.
  231. /// </summary>
  232. /// <param name="scan">The value to read</param>
  233. /// <param name="buffer">The byte array</param>
  234. /// <param name="isFormEncoding">Whether we are decoding a form or not. Will escape '/' if we are doing form encoding</param>
  235. /// <returns>The unescaped byte if success. Otherwise return -1.</returns>
  236. private static int UnescapePercentEncoding(ref int scan, Span<byte> buffer, bool isFormEncoding)
  237. {
  238. if (buffer[scan++] != '%')
  239. {
  240. return -1;
  241. }
  242. var probe = scan;
  243. var value1 = ReadHex(ref probe, buffer);
  244. if (value1 == -1)
  245. {
  246. return -1;
  247. }
  248. var value2 = ReadHex(ref probe, buffer);
  249. if (value2 == -1)
  250. {
  251. return -1;
  252. }
  253. if (SkipUnescape(value1, value2, isFormEncoding))
  254. {
  255. return -1;
  256. }
  257. scan = probe;
  258. return (value1 << 4) + value2;
  259. }
  260. /// <summary>
  261. /// Read the next char and convert it into hexadecimal value.
  262. ///
  263. /// The <paramref name="scan"/> index will be moved to the next
  264. /// byte no matter whether the operation successes.
  265. /// </summary>
  266. /// <param name="scan">The index of the byte in the buffer to read</param>
  267. /// <param name="buffer">The byte span from which the hex to be read</param>
  268. /// <returns>The hexadecimal value if successes, otherwise -1.</returns>
  269. private static int ReadHex(ref int scan, Span<byte> buffer)
  270. {
  271. if (scan == buffer.Length)
  272. {
  273. return -1;
  274. }
  275. var value = buffer[scan++];
  276. var isHex = ((value >= '0') && (value <= '9')) ||
  277. ((value >= 'A') && (value <= 'F')) ||
  278. ((value >= 'a') && (value <= 'f'));
  279. if (!isHex)
  280. {
  281. return -1;
  282. }
  283. if (value <= '9')
  284. {
  285. return value - '0';
  286. }
  287. else if (value <= 'F')
  288. {
  289. return (value - 'A') + 10;
  290. }
  291. else // a - f
  292. {
  293. return (value - 'a') + 10;
  294. }
  295. }
  296. private static bool SkipUnescape(int value1, int value2, bool isFormEncoding)
  297. {
  298. if (isFormEncoding)
  299. {
  300. return false;
  301. }
  302. // skip %2F - '/'
  303. if (value1 == 2 && value2 == 15)
  304. {
  305. return true;
  306. }
  307. return false;
  308. }
  309. /// <summary>
  310. /// Unescape a URL path
  311. /// </summary>
  312. /// <param name="source">The escape sequences is expected to be well-formed UTF-8 code units.</param>
  313. /// <param name="destination">The char span where unescaped url path is copied to.</param>
  314. /// <returns>The length of the char sequence of the unescaped url path.</returns>
  315. /// <remarks>
  316. /// Form Encoding is not supported compared to the <see cref="DecodeRequestLine(ReadOnlySpan{byte}, Span{byte}, bool)" />
  317. /// for performance gains, as current use-cases does not require it.
  318. /// </remarks>
  319. public static int DecodeRequestLine(ReadOnlySpan<char> source, Span<char> destination)
  320. {
  321. // This requires the destination span to be larger or equal to source span
  322. // which is validated by Span<T>.CopyTo.
  323. source.CopyTo(destination);
  324. return DecodeInPlace(destination.Slice(0, source.Length));
  325. }
  326. /// <summary>
  327. /// Unescape a URL path in place.
  328. /// </summary>
  329. /// <param name="buffer">The escape sequences is expected to be well-formed UTF-8 code units.</param>
  330. /// <returns>The number of the chars representing the result.</returns>
  331. /// <remarks>
  332. /// The unescape is done in place, which means after decoding the result is the subset of
  333. /// the input span.
  334. /// Form Encoding is not supported compared to the <see cref="DecodeInPlace(Span{byte}, bool)" />
  335. /// for performance gains, as current use-cases does not require it.
  336. /// </remarks>
  337. public static int DecodeInPlace(Span<char> buffer)
  338. {
  339. // Compared to the byte overload implementation, this is a different
  340. // by using the first occurrence of % as the starting position both
  341. // for the source and the destination index.
  342. int position = buffer.IndexOf('%');
  343. if (position == -1)
  344. {
  345. return buffer.Length;
  346. }
  347. // the slot to read the input
  348. var sourceIndex = position;
  349. // the slot to write the unescaped char
  350. var destinationIndex = position;
  351. while (true)
  352. {
  353. if (sourceIndex == buffer.Length)
  354. {
  355. break;
  356. }
  357. if (buffer[sourceIndex] == '%')
  358. {
  359. var decodeIndex = sourceIndex;
  360. // If decoding process succeeds, the writer iterator will be moved
  361. // to the next write-ready location. On the other hand if the scanned
  362. // percent-encodings cannot be interpreted as sequence of UTF-8 octets,
  363. // these chars should be copied to output as is.
  364. // The decodeReader iterator is always moved to the first char not yet
  365. // be scanned after the process. A failed decoding means the chars
  366. // between the reader and decodeReader can be copied to output untouched.
  367. if (!DecodeCore(ref decodeIndex, ref destinationIndex, buffer))
  368. {
  369. Copy(sourceIndex, decodeIndex, ref destinationIndex, buffer);
  370. }
  371. sourceIndex = decodeIndex;
  372. }
  373. else
  374. {
  375. buffer[destinationIndex++] = buffer[sourceIndex++];
  376. }
  377. }
  378. return destinationIndex;
  379. }
  380. /// <summary>
  381. /// Unescape the percent-encodings
  382. /// </summary>
  383. /// <param name="sourceIndex">The iterator point to the first % char</param>
  384. /// <param name="destinationIndex">The place to write to</param>
  385. /// <param name="buffer">The char array</param>
  386. private static bool DecodeCore(ref int sourceIndex, ref int destinationIndex, Span<char> buffer)
  387. {
  388. // preserves the original head. if the percent-encodings cannot be interpreted as sequence of UTF-8 octets,
  389. // chars from this till the last scanned one will be copied to the memory pointed by writer.
  390. var codeUnit1 = UnescapePercentEncoding(ref sourceIndex, buffer);
  391. if (codeUnit1 == -1)
  392. {
  393. return false;
  394. }
  395. if (codeUnit1 == 0)
  396. {
  397. throw new InvalidOperationException("The path contains null characters.");
  398. }
  399. if (codeUnit1 <= 0x7F)
  400. {
  401. // first code unit < U+007f, it is a single char ASCII
  402. buffer[destinationIndex++] = (char)codeUnit1;
  403. return true;
  404. }
  405. // anticipate more code units
  406. var currentDecodeBits = 0;
  407. var codeUnitCount = 1;
  408. var expectValueMin = 0;
  409. if ((codeUnit1 & 0xE0) == 0xC0)
  410. {
  411. // 110x xxxx, expect one more code unit
  412. currentDecodeBits = codeUnit1 & 0x1F;
  413. codeUnitCount = 2;
  414. expectValueMin = 0x80;
  415. }
  416. else if ((codeUnit1 & 0xF0) == 0xE0)
  417. {
  418. // 1110 xxxx, expect two more code units
  419. currentDecodeBits = codeUnit1 & 0x0F;
  420. codeUnitCount = 3;
  421. expectValueMin = 0x800;
  422. }
  423. else if ((codeUnit1 & 0xF8) == 0xF0)
  424. {
  425. // 1111 0xxx, expect three more code units
  426. currentDecodeBits = codeUnit1 & 0x07;
  427. codeUnitCount = 4;
  428. expectValueMin = 0x10000;
  429. }
  430. else
  431. {
  432. // invalid first code unit
  433. return false;
  434. }
  435. var remainingCodeUnits = codeUnitCount - 1;
  436. while (remainingCodeUnits > 0)
  437. {
  438. // read following three code units
  439. if (sourceIndex == buffer.Length)
  440. {
  441. return false;
  442. }
  443. var nextSourceIndex = sourceIndex;
  444. var nextCodeUnit = UnescapePercentEncoding(ref nextSourceIndex, buffer);
  445. if (nextCodeUnit == -1)
  446. {
  447. return false;
  448. }
  449. // When UnescapePercentEncoding returns -1 we shall return false.
  450. // For performance reasons, there is no separate if statement for the above check
  451. // as the condition below also returns -1 for that case.
  452. if ((nextCodeUnit & 0xC0) != 0x80)
  453. {
  454. // the follow up code unit is not in form of 10xx xxxx
  455. return false;
  456. }
  457. currentDecodeBits = (currentDecodeBits << 6) | (nextCodeUnit & 0x3F);
  458. remainingCodeUnits--;
  459. sourceIndex = nextSourceIndex;
  460. }
  461. if (currentDecodeBits < expectValueMin)
  462. {
  463. // overlong encoding
  464. return false;
  465. }
  466. if (!System.Text.Rune.TryCreate(currentDecodeBits, out var rune) || !rune.TryEncodeToUtf16(buffer.Slice(destinationIndex), out var charsWritten))
  467. {
  468. // Reasons for this failure could be:
  469. // Value is in the range of 0xD800-0xDFFF UTF-16 surrogates that are not allowed in UTF-8
  470. // Value is above the upper Unicode bound of 0x10FFFF
  471. return false;
  472. }
  473. destinationIndex += charsWritten;
  474. return true;
  475. }
  476. /// <summary>
  477. /// Read the percent-encoding and try unescape it.
  478. ///
  479. /// The operation first peek at the character the <paramref name="scan"/>
  480. /// iterator points at. If it is % the <paramref name="scan"/> is then
  481. /// moved on to scan the following to characters. If the two following
  482. /// characters are hexadecimal literals they will be unescaped and the
  483. /// value will be returned.
  484. ///
  485. /// If the first character is not % the <paramref name="scan"/> iterator
  486. /// will be removed beyond the location of % and -1 will be returned.
  487. ///
  488. /// If the following two characters can't be successfully unescaped the
  489. /// <paramref name="scan"/> iterator will be move behind the % and -1
  490. /// will be returned.
  491. /// </summary>
  492. /// <param name="scan">The value to read</param>
  493. /// <param name="buffer">The char array</param>
  494. /// <returns>The unescaped char if success. Otherwise return -1.</returns>
  495. private static int UnescapePercentEncoding(ref int scan, ReadOnlySpan<char> buffer)
  496. {
  497. int tempIdx = scan++;
  498. if (buffer[tempIdx] != '%')
  499. {
  500. return -1;
  501. }
  502. var probe = scan;
  503. int firstNibble = ReadHex(ref probe, buffer);
  504. int secondNibble = ReadHex(ref probe, buffer);
  505. int value = firstNibble << 4 | secondNibble;
  506. // Skip invalid hex values and %2F - '/'
  507. if (value < 0 || value == '/')
  508. {
  509. return -1;
  510. }
  511. scan = probe;
  512. return value;
  513. }
  514. /// <summary>
  515. /// Read the next char and convert it into hexadecimal value.
  516. ///
  517. /// The <paramref name="scan"/> index will be moved to the next
  518. /// char no matter whether the operation successes.
  519. /// </summary>
  520. /// <param name="scan">The index of the char in the buffer to read</param>
  521. /// <param name="buffer">The char span from which the hex to be read</param>
  522. /// <returns>The hexadecimal value if successes, otherwise -1.</returns>
  523. private static int ReadHex(ref int scan, ReadOnlySpan<char> buffer)
  524. {
  525. // To eliminate boundary checks, using a temporary variable tempIdx.
  526. int tempIdx = scan++;
  527. if ((uint)tempIdx >= (uint)buffer.Length)
  528. {
  529. return -1;
  530. }
  531. int value = buffer[tempIdx];
  532. return FromChar(value);
  533. }
  534. [MethodImpl(MethodImplOptions.AggressiveInlining)]
  535. private static int FromChar(int c)
  536. {
  537. return (uint)c >= (uint)CharToHexLookup.Length ? -1 : CharToHexLookup[c];
  538. }
  539. private static ReadOnlySpan<sbyte> CharToHexLookup => new sbyte[]
  540. {
  541. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 15
  542. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 31
  543. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 47
  544. 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, -1, -1, -1, -1, -1, -1, // 63
  545. -1, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 79
  546. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 95
  547. -1, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 111
  548. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 127
  549. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 143
  550. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 159
  551. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 175
  552. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 191
  553. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 207
  554. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 223
  555. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 239
  556. -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 // 255
  557. };
  558. }
  559. }