gen_archive_string_composition_h.sh 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460
  1. #!/bin/sh
  # Abort on the first failing command and on any use of an unset variable.
  set -eu

  # Exactly one argument is required: the path to UnicodeData.txt.
  if [ $# -ne 1 ]
  then
      # Diagnostics go to stderr so they never mix with redirected output.
      echo "Usage: $0 path/to/UnicodeData.txt" >&2
      exit 1
  fi
  #
  # This needs http://unicode.org/Public/6.0.0/ucd/UnicodeData.txt
  #
  inputfile="$1"	# Expect UnicodeData.txt
  # The generated header is written to the current directory.
  outfile=archive_string_composition.h

  # Scratch files: the two generated awk programs and the intermediate
  # decomposition table.  They start out empty so the clean-up handler is
  # safe under "set -u" even if it fires before all of them exist.
  pickout=
  pickout2=
  nfdtmp=

  # Delete whichever scratch files have been created so far.
  remove_scratch_files()
  {
      for scratch in "${pickout}" "${pickout2}" "${nfdtmp}"
      do
          [ -z "${scratch}" ] || rm -f "${scratch}"
      done
  }
  # Clean up on every exit path, including a "set -e" abort; turn the usual
  # termination signals into a normal exit so the EXIT handler still runs.
  trap remove_scratch_files EXIT
  trap 'exit 1' HUP INT TERM

  # mktemp creates each file atomically with an unpredictable name, unlike
  # the previous PID-based names and the fixed file in the working directory.
  # The files stay under /tmp so the names never contain whitespace; the
  # scratch-table name is pasted verbatim into the first awk program.
  pickout=$(mktemp /tmp/mk_unicode_composition_tbl.XXXXXX)
  pickout2=$(mktemp /tmp/mk_unicode_composition_tbl2.XXXXXX)
  nfdtmp=$(mktemp /tmp/mk_unicode_decomposition_tmp.XXXXXX)
  17. #################################################################################
  18. #
  19. # Append the file header of "archive_string_composition.h"
  20. #
  21. #################################################################################
  # Create (or truncate) ${outfile} and write the fixed prologue of the
  # generated header: licence text, the "generated file" notice, the
  # internal-use guard, the include guard and the definition of
  # struct unicode_composition_table.  Takes no arguments.
  append_copyright()
  {
      # The delimiter is quoted so the text is copied verbatim.  With an
      # unquoted delimiter the shell treats the doubled backquotes in the
      # disclaimer as an empty command substitution and silently drops them.
      cat > "${outfile}" <<'CR_END'
/*-
* Copyright (c) 2011-2012 libarchive Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* ATTENTION!
* This file is generated by build/utils/gen_archive_string_composition_h.sh
* from http://unicode.org/Public/6.0.0/ucd/UnicodeData.txt
*
* See also http://unicode.org/report/tr15/
*/
#ifndef __LIBARCHIVE_BUILD
#error This header is only to be used internally to libarchive.
#endif
#ifndef ARCHIVE_STRING_COMPOSITION_H_INCLUDED
#define ARCHIVE_STRING_COMPOSITION_H_INCLUDED
struct unicode_composition_table {
uint32_t cp1;
uint32_t cp2;
uint32_t nfc;
};
CR_END
  }
  68. #################################################################################
  69. #
  70. # awk script
  71. #
  72. #################################################################################
  cat > ${pickout} <<AWK_END
  #
  # First pass over UnicodeData.txt.  This text goes through an unquoted
  # here-document, so the shell expands it before awk ever sees it.
  #
  BEGIN {
      # UnicodeData.txt separates its fields with semicolons.
      FS = ";"
      # First and last code point seen with a non-zero combining class.
      min = ""
      max = ""
      # Scratch file for the decomposition triples; the shell fills in the name.
      nfdtbl = "${nfdtmp}"
      # Triples written to this pipe are sorted and formatted as C initializer rows.
      cmd = "sort | awk -F ' ' '{printf \"\\\\t{ 0x%s , 0x%s , 0x%s },\\\\n\",\$1,\$2,\$3}'"
      print "static const struct unicode_composition_table u_composition_table[] = {"
  }
  #
  # After the last input record: finish the composition table, then emit the
  # combining-class range macros, the per-block flag array and the three
  # lookup tables behind the CCC() macro.
  #
  END {
      # Closing the pipe lets the sort/format pipeline write out every
      # composition row before the table is terminated.
      close(cmd)
      print "};"
      print ""
      #
      # Output Canonical Combining Class tables used for translating NFD to NFC.
      #
      printf "#define CANONICAL_CLASS_MIN\\t0x%s\\n", min
      printf "#define CANONICAL_CLASS_MAX\\t0x%s\\n", max
      print ""
      printf "#define IS_DECOMPOSABLE_BLOCK(uc)\\t\\\\\n"
      # highnum is the block number left behind by the last record that had
      # a non-zero combining class.
      printf "\\t(((uc)>>8) <= 0x%X && u_decomposable_blocks[(uc)>>8])\\n", highnum
      printf "static const char u_decomposable_blocks[0x%X+1] = {\\n\\t", highnum
      #
      # Output blockmap: one flag per 256-code-point block, 32 flags per line.
      for (i = 0; i <= highnum; i++) {
          if (i != 0 && i % 32 == 0)
              printf "\\n\\t"
          # Additionally Hangul[11XX(17), AC00(172) - D7FF(215)] is decomposable.
          if (blockmap[i] || i == 17 || (i >= 172 && i <= 215))
              printf "1,"
          else
              printf "0,"
      }
      printf "\\n};\\n\\n"
      #
      # Output a macro to get a canonical combining class.
      #
      print "/* Get Canonical Combining Class(CCC). */"
      printf "#define CCC(uc)\\t\\\\\n"
      printf "\\t(((uc) > 0x%s)?0:\\\\\\n", max
      printf "\\tccc_val[ccc_val_index[ccc_index[(uc)>>8]][((uc)>>4)&0x0F]][(uc)&0x0F])\\n"
      print ""
      #
      # Output a canonical combining class value table.
      # Row 0 is all zeroes; each later row covers one group of 16 code
      # points that contains at least one non-zero class.
      #
      midcnt = 0
      printf "/* The table of the value of Canonical Combining Class */\\n"
      print "static const unsigned char ccc_val[][16] = {"
      print " /* idx=0: XXXX0 - XXXXF */"
      print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
      for (h = 0; h <= highnum; h++) {
          if (!blockmap[h])
              continue;
          for (m = 0; m < 16; m++) {
              if (!xx_blockmap[h, m])
                  continue;
              midcnt++
              printf " /* idx=%d: %03X%1X0 - %03X%1XF */\\n {", midcnt, h, m, h, m
              for (l = 0; l < 15; l++) {
                  printf "%d, ", xxx_blockmap[h, m, l]
              }
              printf "%d },\\n", xxx_blockmap[h, m, 15]
          }
      }
      printf "};\\n"
      #
      # Output the index table of the canonical combining class value table.
      # midcnt is restarted and advanced in the same order as above, so the
      # numbers written here are the row numbers of ccc_val.
      #
      cnt = 0
      midcnt = 0
      printf "\\n/* The index table to ccc_val[*][16] */\\n"
      print "static const unsigned char ccc_val_index[][16] = {"
      print " /* idx=0: XXX00 - XXXFF */"
      print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
      for (h = 0; h <= highnum; h++) {
          if (!blockmap[h])
              continue;
          cnt++
          printf " /* idx=%d: %03X00 - %03XFF */\\n {", cnt, h, h
          for (m = 0; m < 16; m++) {
              if (m != 0)
                  printf ","
              if (xx_blockmap[h, m]) {
                  midcnt++
                  printf "%2d", midcnt
              } else
                  printf " 0"
          }
          printf " },\\n"
      }
      printf "};\\n"
      #
      # Output the index table to the index table of the canonical combining
      # class value table: 24 entries per line, 0 for blocks without data.
      #
      printf "\\n/* The index table to ccc_val_index[*][16] */\\n"
      printf "static const unsigned char ccc_index[] = {\\n "
      cnt = 0
      for (h = 0; h <= highnum; h++) {
          if (h != 0 && h % 24 == 0)
              printf "\\n "
          if (blockmap[h]) {
              cnt++;
              printf "%2d,", cnt
          } else
              printf " 0,"
      }
      print "};"
      print ""
  }
  184. #
  185. #
  #
  # Convert a string of upper-case hexadecimal digits to a number.
  #   hex          the digit string, for example "1D2"
  #   val, pos, d  locals; awk scopes locals by declaring them as extra
  #                parameters, and callers pass only hex
  # Returns the numeric value; an empty string gives 0.  Characters other
  # than 0-9 and A-F are skipped without contributing to the result.
  #
  function hextoi(hex,    val, pos, d)
  {
      val = 0
      for (pos = 1; pos <= length(hex); pos++) {
          # index() is 1-based and returns 0 when the character is not found.
          d = index("0123456789ABCDEF", substr(hex, pos, 1))
          if (d > 0)
              val = val * 16 + (d - 1)
      }
      return val
  }
  #
  # Collect Canonical Combining Class values.
  # Runs for every record whose field 4 is present and is not exactly "0".
  # The class is stored under the three parts of the code point in field 1.
  #
  \$4 ~ /^[0-9A-F]+\$/ && \$4 !~ /^0\$/ {
      # Remember the first and the most recent such code point.
      if (min == "")
          min = \$1
      max = \$1
      cplen = length(\$1)
      # Split the code point: all but the last two hex digits, then one digit each.
      highnum = hextoi(substr(\$1, 1, cplen - 2))
      midnum = hextoi(substr(\$1, cplen - 1, 1))
      lownum = hextoi(substr(\$1, cplen, 1))
      blockmap[highnum] = 1
      xx_blockmap[highnum, midnum] = 1
      xxx_blockmap[highnum, midnum, lownum] = \$4
  }
  228. #
  229. # Following code points are not decomposed in MAC OS.
  230. # U+2000 - U+2FFF
  231. # U+F900 - U+FAFF
  232. # U+2F800 - U+2FAFF
  233. #
  234. #\$1 ~/^2[0-9A-F][0-9A-F][0-9A-F]\$/ {
  235. # next
  236. #}
  237. #\$1 ~/^F[9A][0-9A-F][0-9A-F]\$/ {
  238. # next
  239. #}
  240. #\$1 ~/^2F[89A][0-9A-F][0-9A-F]\$/ {
  241. # next
  242. #}
  #
  # Exclusion code points specified by
  # http://unicode.org/Public/6.0.0/ucd/CompositionExclusions.txt
  #
  # A record whose code point (field 1) matches any expression below is
  # dropped here, so no later rule sees it.
  ##
  # 1. Script Specifics
  ##
  \$1 ~ /^095[89ABCDEF]\$/ ||
  \$1 ~ /^09D[CDF]\$/ ||
  \$1 ~ /^0A3[36]\$/ ||
  \$1 ~ /^0A5[9ABE]\$/ ||
  \$1 ~ /^0B5[CD]\$/ ||
  \$1 ~ /^0F4[3D]\$/ ||
  \$1 ~ /^0F5[27C]\$/ ||
  \$1 ~ /^0F69\$/ ||
  \$1 ~ /^0F7[68]\$/ ||
  \$1 ~ /^0F9[3D]\$/ ||
  \$1 ~ /^0FA[27C]\$/ ||
  \$1 ~ /^0FB9\$/ ||
  \$1 ~ /^FB1[DF]\$/ ||
  \$1 ~ /^FB2[ABCDEF]\$/ ||
  \$1 ~ /^FB3[012345689ABCE]\$/ ||
  \$1 ~ /^FB4[01346789ABCDE]\$/ {
      next
  }
  ##
  # 2. Post Composition Version precomposed characters
  #    (records for these code points are skipped)
  ##
  \$1 ~ /^2ADC\$/ ||
  \$1 ~ /^1D15[EF]\$/ ||
  \$1 ~ /^1D16[01234]\$/ ||
  \$1 ~ /^1D1B[BCDEF]\$/ ||
  \$1 ~ /^1D1C0\$/ {
      next
  }
  ##
  # 3. Singleton Decompositions
  #    (records for these code points are skipped)
  ##
  \$1 ~ /^034[01]\$/ ||
  \$1 ~ /^037[4E]\$/ ||
  \$1 ~ /^0387\$/ ||
  \$1 ~ /^1F7[13579BD]\$/ ||
  \$1 ~ /^1FB[BE]\$/ ||
  \$1 ~ /^1FC[9B]\$/ ||
  \$1 ~ /^1FD[3B]\$/ ||
  \$1 ~ /^1FE[3BEF]\$/ ||
  \$1 ~ /^1FF[9BD]\$/ ||
  \$1 ~ /^200[01]\$/ ||
  \$1 ~ /^212[6AB]\$/ ||
  \$1 ~ /^232[9A]\$/ ||
  \$1 ~ /^F9[0-9A-F][0-9A-F]\$/ ||
  \$1 ~ /^FA0[0-9A-D]\$/ ||
  \$1 ~ /^FA1[025-9A-E]\$/ ||
  \$1 ~ /^FA2[0256A-D]\$/ ||
  \$1 ~ /^FA[3-5][0-9A-F]\$/ ||
  \$1 ~ /^FA6[0-9A-D]\$/ ||
  \$1 ~ /^FA[7-9A-C][0-9A-F]\$/ ||
  \$1 ~ /^FAD[0-9]\$/ ||
  \$1 ~ /^2F[89][0-9A-F][0-9A-F]\$/ ||
  \$1 ~ /^2FA0[0-9A-F]\$/ ||
  \$1 ~ /^2FA1[0-9A-D]\$/ {
      next
  }
  ##
  # 4. Non-Starter Decompositions
  #    (records for these code points are skipped)
  ##
  \$1 ~ /^0344\$/ ||
  \$1 ~ /^0F7[35]\$/ ||
  \$1 ~ /^0F81\$/ {
      next
  }
  399. #
  400. # Output combinations for NFD ==> NFC.
  401. #
  402. \$6 ~/^[0-9A-F]+ [0-9A-F]+\$/ {
  403. split(\$6, cp, " ")
  404. if (length(\$1) == 4)
  405. print "0"cp[1], "0"cp[2], "0"\$1 | cmd
  406. else
  407. print cp[1], cp[2], \$1 | cmd
  408. # NFC ==> NFD table.
  409. if (length(\$1) == 4)
  410. print "0"\$1, "0"cp[1], "0"cp[2] >>nfdtbl
  411. else
  412. print \$1, cp[1], cp[2] >>nfdtbl
  413. }
  414. AWK_END
  415. #################################################################################
  416. # awk script
  417. #
  418. #################################################################################
  cat > ${pickout2} <<'AWK_END'
  #
  # Second pass: wrap the whitespace-separated "nfc cp1 cp2" lines of the
  # scratch table in the C definition of u_decomposition_table.
  # The here-document delimiter is quoted, so the shell expands nothing here
  # and the awk field references need no escaping.
  #
  BEGIN {
      FS = " "
      print "struct unicode_decomposition_table {"
      print "\tuint32_t nfc;"
      print "\tuint32_t cp1;"
      print "\tuint32_t cp2;"
      print "};"
      print ""
      print "static const struct unicode_decomposition_table u_decomposition_table[] = {"
  }
  # One initializer row per input line.
  {
      printf "\t{ 0x%s , 0x%s , 0x%s },\n", $1, $2, $3
  }
  # Close the array and leave a blank line after it.
  END {
      print "};"
      print ""
  }
AWK_END
  #################################################################################
  #
  # Run the awk scripts.
  #
  #################################################################################
  # The banner goes first because append_copyright truncates the output file;
  # everything after it is appended.  All expansions are quoted so that a
  # user-supplied path containing spaces or glob characters stays one argument.
  append_copyright
  awk -f "${pickout}" "${inputfile}" >> "${outfile}"
  awk -f "${pickout2}" "${nfdtmp}" >> "${outfile}"
  echo "#endif /* ARCHIVE_STRING_COMPOSITION_H_INCLUDED */" >> "${outfile}"
  echo "" >> "${outfile}"
  #
  # Remove the generated awk scripts and the scratch table.
  # -f keeps this from failing if a file has already been removed.
  rm -f "${pickout}" "${pickout2}" "${nfdtmp}"