gen_archive_string_composition_h.sh 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455
  1. #!/bin/sh
  2. #
  3. # This needs http://unicode.org/Public/6.0.0/ucd/UnicodeData.txt
  4. #
  5. inputfile="$1" # Expect UnicodeData.txt
  6. outfile=archive_string_composition.h
  7. pickout=/tmp/mk_unicode_composition_tbl$$.awk
  8. pickout2=/tmp/mk_unicode_composition_tbl2$$.awk
  9. #nfdtmp=/tmp/mk_unicode_decomposition_tmp$$.txt
  10. nfdtmp="nfdtmpx"
  11. #################################################################################
  12. #
  13. # Append the file header of "archive_string_composition.h"
  14. #
  15. #################################################################################
  16. append_copyright()
  17. {
  18. cat > ${outfile} <<CR_END
  19. /*-
  20. * Copyright (c) 2011-2012 libarchive Project
  21. * All rights reserved.
  22. *
  23. * Redistribution and use in source and binary forms, with or without
  24. * modification, are permitted provided that the following conditions
  25. * are met:
  26. * 1. Redistributions of source code must retain the above copyright
  27. * notice, this list of conditions and the following disclaimer.
  28. * 2. Redistributions in binary form must reproduce the above copyright
  29. * notice, this list of conditions and the following disclaimer in the
  30. * documentation and/or other materials provided with the distribution.
  31. *
  32. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
  33. * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  34. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  35. * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
  36. * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  37. * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  38. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  39. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  40. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  41. * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  42. *
  43. * \$FreeBSD\$
  44. *
  45. */
  46. /*
  47. * ATTENTION!
  48. * This file is generated by build/utils/gen_archive_string_composition_h.sh
  49. * from http://unicode.org/Public/6.0.0/ucd/UnicodeData.txt
  50. *
  51. * See also http://unicode.org/report/tr15/
  52. */
  53. #ifndef __LIBARCHIVE_BUILD
  54. #error This header is only to be used internally to libarchive.
  55. #endif
  56. #ifndef ARCHIVE_STRING_COMPOSITION_H_INCLUDED
  57. #define ARCHIVE_STRING_COMPOSITION_H_INCLUDED
  58. struct unicode_composition_table {
  59. uint32_t cp1;
  60. uint32_t cp2;
  61. uint32_t nfc;
  62. };
  63. CR_END
  64. }
  65. #################################################################################
  66. #
  67. # awk script
  68. #
  69. #################################################################################
  70. cat > ${pickout} <<AWK_END
  71. #
  72. BEGIN {
  73. FS = ";"
  74. min = "";
  75. max = "";
  76. cmd="sort | awk -F ' ' '{printf \"\\\\t{ 0x%s , 0x%s , 0x%s },\\\\n\",\$1,\$2,\$3}'"
  77. nfdtbl="${nfdtmp}"
  78. print "static const struct unicode_composition_table u_composition_table[] = {"
  79. }
  80. END {
  81. close(cmd)
  82. print "};"
  83. print ""
  84. #
  85. # Output Canonical Combining Class tables used for translating NFD to NFC.
  86. #
  87. printf "#define CANONICAL_CLASS_MIN\\t0x%s\\n", min
  88. printf "#define CANONICAL_CLASS_MAX\\t0x%s\\n", max
  89. print ""
  90. printf "#define IS_DECOMPOSABLE_BLOCK(uc)\\t\\\\\n"
  91. printf "\\t(((uc)>>8) <= 0x%X && u_decomposable_blocks[(uc)>>8])\\n", highnum
  92. printf "static const char u_decomposable_blocks[0x%X+1] = {\\n\\t", highnum
  93. #
  94. # Output blockmap
  95. for (i = 0; i <= highnum; i++) {
  96. if (i != 0 && i % 32 == 0)
  97. printf "\\n\\t"
  98. # Additionally Hangul[11XX(17), AC00(172) - D7FF(215)] is decomposable.
  99. if (blockmap[i] || i == 17 || (i >= 172 && i <= 215))
  100. printf "1,"
  101. else
  102. printf "0,"
  103. }
  104. printf "\\n};\\n\\n"
  105. #
  106. # Output a macro to get a canonical combining class.
  107. #
  108. print "/* Get Canonical Combining Class(CCC). */"
  109. printf "#define CCC(uc)\\t\\\\\n"
  110. printf "\\t(((uc) > 0x%s)?0:\\\\\\n", max
  111. printf "\\tccc_val[ccc_val_index[ccc_index[(uc)>>8]][((uc)>>4)&0x0F]][(uc)&0x0F])\\n"
  112. print ""
  113. #
  114. # Output a canonical combining class value table.
  115. #
  116. midcnt = 0
  117. printf "/* The table of the value of Canonical Cimbining Class */\\n"
  118. print "static const unsigned char ccc_val[][16] = {"
  119. print " /* idx=0: XXXX0 - XXXXF */"
  120. print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
  121. for (h = 0; h <= highnum; h++) {
  122. if (!blockmap[h])
  123. continue;
  124. for (m = 0; m < 16; m++) {
  125. if (!xx_blockmap[h, m])
  126. continue;
  127. midcnt++
  128. printf " /* idx=%d: %03X%1X0 - %03X%1XF */\\n {", midcnt, h, m, h, m
  129. for (l = 0; l < 15; l++) {
  130. printf "%d, ", xxx_blockmap[h, m, l]
  131. }
  132. printf "%d },\n", xxx_blockmap[h, m, 15]
  133. }
  134. }
  135. printf "};\n"
  136. #
  137. # Output the index table of the canonical combining class value table.
  138. #
  139. cnt = 0
  140. midcnt = 0
  141. printf "\\n/* The index table to ccc_val[*][16] */\\n"
  142. print "static const unsigned char ccc_val_index[][16] = {"
  143. print " /* idx=0: XXX00 - XXXFF */"
  144. print " { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },"
  145. for (h = 0; h <= highnum; h++) {
  146. if (!blockmap[h])
  147. continue;
  148. cnt++
  149. printf " /* idx=%d: %03X00 - %03XFF */\\n {", cnt, h, h
  150. for (m = 0; m < 16; m++) {
  151. if (m != 0)
  152. printf ","
  153. if (xx_blockmap[h, m]) {
  154. midcnt++
  155. printf "%2d", midcnt
  156. } else
  157. printf " 0"
  158. }
  159. printf " },\\n"
  160. }
  161. printf "};\\n"
  162. #
  163. # Output the index table to the index table of the canonical combining
  164. # class value table.
  165. #
  166. printf "\\n/* The index table to ccc_val_index[*][16] */\\n"
  167. printf "static const unsigned char ccc_index[] = {\\n ", h
  168. cnt = 0
  169. for (h = 0; h <= highnum; h++) {
  170. if (h != 0 && h % 24 == 0)
  171. printf "\\n "
  172. if (blockmap[h]) {
  173. cnt++;
  174. printf "%2d,", cnt
  175. } else
  176. printf " 0,"
  177. }
  178. print "};"
  179. print ""
  180. }
  181. #
  182. #
  183. function hextoi(hex)
  184. {
  185. dec = 0
  186. for (i=0; i < length(hex); i++) {
  187. x = substr(hex, i+1, 1)
  188. if (x ~/[0-9]/)
  189. dec = dec * 16 + x;
  190. else if (x == "A")
  191. dec = dec * 16 + 10;
  192. else if (x == "B")
  193. dec = dec * 16 + 11;
  194. else if (x == "C")
  195. dec = dec * 16 + 12;
  196. else if (x == "D")
  197. dec = dec * 16 + 13;
  198. else if (x == "E")
  199. dec = dec * 16 + 14;
  200. else if (x == "F")
  201. dec = dec * 16 + 15;
  202. }
  203. return dec
  204. }
  205. #
  206. # Collect Canonical Combining Class values.
  207. #
  208. \$4 ~/^[0-9A-F]+$/ {
  209. if (\$4 !~/^0$/) {
  210. if (min == "") {
  211. min = \$1
  212. }
  213. max = \$1
  214. high = substr(\$1, 1, length(\$1) -2)
  215. highnum = hextoi(high)
  216. mid = substr(\$1, length(\$1) -1, 1)
  217. midnum = hextoi(mid)
  218. low = substr(\$1, length(\$1), 1)
  219. lownum = hextoi(low)
  220. blockmap[highnum] = 1
  221. xx_blockmap[highnum, midnum] = 1
  222. xxx_blockmap[highnum, midnum, lownum] = \$4
  223. }
  224. }
  225. #
  226. # Following code points are not decomposed in MAC OS.
  227. # U+2000 - U+2FFF
  228. # U+F900 - U+FAFF
  229. # U+2F800 - U+2FAFF
  230. #
  231. #\$1 ~/^2[0-9A-F][0-9A-F][0-9A-F]\$/ {
  232. # next
  233. #}
  234. #\$1 ~/^F[9A][0-9A-F][0-9A-F]\$/ {
  235. # next
  236. #}
  237. #\$1 ~/^2F[89A][0-9A-F][0-9A-F]\$/ {
  238. # next
  239. #}
  240. #
  241. # Exclusion code points specified by
  242. # http://unicode.org/Public/6.0.0/ucd/CompositionExclusions.txt
  243. ##
  244. # 1. Script Specifices
  245. ##
  246. \$1 ~/^095[89ABCDEF]\$/ {
  247. next
  248. }
  249. \$1 ~/^09D[CDF]\$/ {
  250. next
  251. }
  252. \$1 ~/^0A3[36]\$/ {
  253. next
  254. }
  255. \$1 ~/^0A5[9ABE]\$/ {
  256. next
  257. }
  258. \$1 ~/^0B5[CD]\$/ {
  259. next
  260. }
  261. \$1 ~/^0F4[3D]\$/ {
  262. next
  263. }
  264. \$1 ~/^0F5[27C]\$/ {
  265. next
  266. }
  267. \$1 ~/^0F69\$/ {
  268. next
  269. }
  270. \$1 ~/^0F7[68]\$/ {
  271. next
  272. }
  273. \$1 ~/^0F9[3D]\$/ {
  274. next
  275. }
  276. \$1 ~/^0FA[27C]\$/ {
  277. next
  278. }
  279. \$1 ~/^0FB9\$/ {
  280. next
  281. }
  282. \$1 ~/^FB1[DF]\$/ {
  283. next
  284. }
  285. \$1 ~/^FB2[ABCDEF]\$/ {
  286. next
  287. }
  288. \$1 ~/^FB3[012345689ABCE]\$/ {
  289. next
  290. }
  291. \$1 ~/^FB4[01346789ABCDE]\$/ {
  292. next
  293. }
  294. ##
  295. # 2. Post Composition Version precomposed characters
  296. ##
  297. \$1 ~/^2ADC\$/ {
  298. next
  299. }
  300. \$1 ~/^1D15[EF]\$/ {
  301. next
  302. }
  303. \$1 ~/^1D16[01234]\$/ {
  304. next
  305. }
  306. \$1 ~/^1D1B[BCDEF]\$/ {
  307. next
  308. }
  309. \$1 ~/^1D1C0\$/ {
  310. next
  311. }
  312. ##
  313. # 3. Singleton Decompositions
  314. ##
  315. \$1 ~/^034[01]\$/ {
  316. next
  317. }
  318. \$1 ~/^037[4E]\$/ {
  319. next
  320. }
  321. \$1 ~/^0387\$/ {
  322. next
  323. }
  324. \$1 ~/^1F7[13579BD]\$/ {
  325. next
  326. }
  327. \$1 ~/^1FB[BE]\$/ {
  328. next
  329. }
  330. \$1 ~/^1FC[9B]\$/ {
  331. next
  332. }
  333. \$1 ~/^1FD[3B]\$/ {
  334. next
  335. }
  336. \$1 ~/^1FE[3BEF]\$/ {
  337. next
  338. }
  339. \$1 ~/^1FF[9BD]\$/ {
  340. next
  341. }
  342. \$1 ~/^200[01]\$/ {
  343. next
  344. }
  345. \$1 ~/^212[6AB]\$/ {
  346. next
  347. }
  348. \$1 ~/^232[9A]\$/ {
  349. next
  350. }
  351. \$1 ~/^F9[0-9A-F][0-9A-F]\$/ {
  352. next
  353. }
  354. \$1 ~/^FA0[0-9A-D]\$/ {
  355. next
  356. }
  357. \$1 ~/^FA1[025-9A-E]\$/ {
  358. next
  359. }
  360. \$1 ~/^FA2[0256A-D]\$/ {
  361. next
  362. }
  363. \$1 ~/^FA[3-5][0-9A-F]\$/ {
  364. next
  365. }
  366. \$1 ~/^FA6[0-9A-D]\$/ {
  367. next
  368. }
  369. \$1 ~/^FA[7-9A-C][0-9A-F]\$/ {
  370. next
  371. }
  372. \$1 ~/^FAD[0-9]\$/ {
  373. next
  374. }
  375. \$1 ~/^2F[89][0-9A-F][0-9A-F]\$/ {
  376. next
  377. }
  378. \$1 ~/^2FA0[0-9A-F]\$/ {
  379. next
  380. }
  381. \$1 ~/^2FA1[0-9A-D]\$/ {
  382. next
  383. }
  384. ##
  385. # 4. Non-Starter Decompositions
  386. ##
  387. \$1 ~/^0344\$/ {
  388. next
  389. }
  390. \$1 ~/^0F7[35]\$/ {
  391. next
  392. }
  393. \$1 ~/^0F81\$/ {
  394. next
  395. }
  396. #
  397. # Output combinations for NFD ==> NFC.
  398. #
  399. \$6 ~/^[0-9A-F]+ [0-9A-F]+\$/ {
  400. split(\$6, cp, " ")
  401. if (length(\$1) == 4)
  402. print "0"cp[1], "0"cp[2], "0"\$1 | cmd
  403. else
  404. print cp[1], cp[2], \$1 | cmd
  405. # NFC ==> NFD table.
  406. if (length(\$1) == 4)
  407. print "0"\$1, "0"cp[1], "0"cp[2] >>nfdtbl
  408. else
  409. print \$1, cp[1], cp[2] >>nfdtbl
  410. }
  411. AWK_END
  412. #################################################################################
  413. # awk script
  414. #
  415. #################################################################################
  416. cat > ${pickout2} <<AWK_END
  417. #
  418. BEGIN {
  419. FS = " "
  420. print "struct unicode_decomposition_table {"
  421. print "\tuint32_t nfc;"
  422. print "\tuint32_t cp1;"
  423. print "\tuint32_t cp2;"
  424. print "};"
  425. print ""
  426. print "static const struct unicode_decomposition_table u_decomposition_table[] = {"
  427. }
  428. END {
  429. print "};"
  430. print ""
  431. }
  432. {
  433. printf "\t{ 0x%s , 0x%s , 0x%s },\n", \$1, \$2, \$3;
  434. }
  435. AWK_END
  436. #################################################################################
  437. #
# Run the awk scripts.
  439. #
  440. #################################################################################
  441. append_copyright
  442. awk -f ${pickout} ${inputfile} >> ${outfile}
  443. awk -f ${pickout2} ${nfdtmp} >> ${outfile}
  444. echo "#endif /* ARCHIVE_STRING_COMPOSITION_H_INCLUDED */" >> ${outfile}
  445. echo "" >> ${outfile}
  446. #
  447. # Remove awk the script.
  448. rm ${pickout}
  449. rm ${pickout2}
  450. rm ${nfdtmp}