/* SPDX-License-Identifier: 0BSD */

/**
 * \file        lzma/lzma12.h
 * \brief       LZMA1 and LZMA2 filters
 * \note        Never include this file directly. Use <lzma.h> instead.
 */

/*
 * Author: Lasse Collin
 */
  10. #ifndef LZMA_H_INTERNAL
  11. # error Never include this file directly. Use <lzma.h> instead.
  12. #endif
  13. /**
  14. * \brief LZMA1 Filter ID (for raw encoder/decoder only, not in .xz)
  15. *
  16. * LZMA1 is the very same thing as what was called just LZMA in LZMA Utils,
  17. * 7-Zip, and LZMA SDK. It's called LZMA1 here to prevent developers from
  18. * accidentally using LZMA when they actually want LZMA2.
  19. */
  20. #define LZMA_FILTER_LZMA1 LZMA_VLI_C(0x4000000000000001)
  21. /**
  22. * \brief LZMA1 Filter ID with extended options (for raw encoder/decoder)
  23. *
  24. * This is like LZMA_FILTER_LZMA1 but with this ID a few extra options
  25. * are supported in the lzma_options_lzma structure:
  26. *
  27. * - A flag to tell the encoder if the end of payload marker (EOPM) alias
  28. * end of stream (EOS) marker must be written at the end of the stream.
  29. * In contrast, LZMA_FILTER_LZMA1 always writes the end marker.
  30. *
  31. * - Decoder needs to be told the uncompressed size of the stream
  32. * or that it is unknown (using the special value UINT64_MAX).
  33. * If the size is known, a flag can be set to allow the presence of
  34. * the end marker anyway. In contrast, LZMA_FILTER_LZMA1 always
  35. * behaves as if the uncompressed size was unknown.
  36. *
  37. * This allows handling file formats where LZMA1 streams are used but where
  38. * the end marker isn't allowed or where it might not (always) be present.
  39. * This extended LZMA1 functionality is provided as a Filter ID for raw
  40. * encoder and decoder instead of adding new encoder and decoder initialization
  41. * functions because this way it is possible to also use extra filters,
  42. * for example, LZMA_FILTER_X86 in a filter chain with LZMA_FILTER_LZMA1EXT,
  43. * which might be needed to handle some file formats.
  44. */
  45. #define LZMA_FILTER_LZMA1EXT LZMA_VLI_C(0x4000000000000002)
  46. /**
  47. * \brief LZMA2 Filter ID
  48. *
  49. * Usually you want this instead of LZMA1. Compared to LZMA1, LZMA2 adds
  50. * support for LZMA_SYNC_FLUSH, uncompressed chunks (smaller expansion
  51. * when trying to compress incompressible data), possibility to change
  52. * lc/lp/pb in the middle of encoding, and some other internal improvements.
  53. */
  54. #define LZMA_FILTER_LZMA2 LZMA_VLI_C(0x21)
  55. /**
  56. * \brief Match finders
  57. *
  58. * Match finder has major effect on both speed and compression ratio.
  59. * Usually hash chains are faster than binary trees.
  60. *
  61. * If you will use LZMA_SYNC_FLUSH often, the hash chains may be a better
  62. * choice, because binary trees get much higher compression ratio penalty
  63. * with LZMA_SYNC_FLUSH.
  64. *
  65. * The memory usage formulas are only rough estimates, which are closest to
  66. * reality when dict_size is a power of two. The formulas are more complex
  67. * in reality, and can also change a little between liblzma versions. Use
  68. * lzma_raw_encoder_memusage() to get more accurate estimate of memory usage.
  69. */
  70. typedef enum {
  71. LZMA_MF_HC3 = 0x03,
  72. /**<
  73. * \brief Hash Chain with 2- and 3-byte hashing
  74. *
  75. * Minimum nice_len: 3
  76. *
  77. * Memory usage:
  78. * - dict_size <= 16 MiB: dict_size * 7.5
  79. * - dict_size > 16 MiB: dict_size * 5.5 + 64 MiB
  80. */
  81. LZMA_MF_HC4 = 0x04,
  82. /**<
  83. * \brief Hash Chain with 2-, 3-, and 4-byte hashing
  84. *
  85. * Minimum nice_len: 4
  86. *
  87. * Memory usage:
  88. * - dict_size <= 32 MiB: dict_size * 7.5
  89. * - dict_size > 32 MiB: dict_size * 6.5
  90. */
  91. LZMA_MF_BT2 = 0x12,
  92. /**<
  93. * \brief Binary Tree with 2-byte hashing
  94. *
  95. * Minimum nice_len: 2
  96. *
  97. * Memory usage: dict_size * 9.5
  98. */
  99. LZMA_MF_BT3 = 0x13,
  100. /**<
  101. * \brief Binary Tree with 2- and 3-byte hashing
  102. *
  103. * Minimum nice_len: 3
  104. *
  105. * Memory usage:
  106. * - dict_size <= 16 MiB: dict_size * 11.5
  107. * - dict_size > 16 MiB: dict_size * 9.5 + 64 MiB
  108. */
  109. LZMA_MF_BT4 = 0x14
  110. /**<
  111. * \brief Binary Tree with 2-, 3-, and 4-byte hashing
  112. *
  113. * Minimum nice_len: 4
  114. *
  115. * Memory usage:
  116. * - dict_size <= 32 MiB: dict_size * 11.5
  117. * - dict_size > 32 MiB: dict_size * 10.5
  118. */
  119. } lzma_match_finder;
  120. /**
  121. * \brief Test if given match finder is supported
  122. *
  123. * It is safe to call this with a value that isn't listed in
  124. * lzma_match_finder enumeration; the return value will be false.
  125. *
  126. * There is no way to list which match finders are available in this
  127. * particular liblzma version and build. It would be useless, because
  128. * a new match finder, which the application developer wasn't aware,
  129. * could require giving additional options to the encoder that the older
  130. * match finders don't need.
  131. *
  132. * \param match_finder Match finder ID
  133. *
  134. * \return lzma_bool:
  135. * - true if the match finder is supported by this liblzma build.
  136. * - false otherwise.
  137. */
  138. extern LZMA_API(lzma_bool) lzma_mf_is_supported(lzma_match_finder match_finder)
  139. lzma_nothrow lzma_attr_const;
  140. /**
  141. * \brief Compression modes
  142. *
  143. * This selects the function used to analyze the data produced by the match
  144. * finder.
  145. */
  146. typedef enum {
  147. LZMA_MODE_FAST = 1,
  148. /**<
  149. * \brief Fast compression
  150. *
  151. * Fast mode is usually at its best when combined with
  152. * a hash chain match finder.
  153. */
  154. LZMA_MODE_NORMAL = 2
  155. /**<
  156. * \brief Normal compression
  157. *
  158. * This is usually notably slower than fast mode. Use this
  159. * together with binary tree match finders to expose the
  160. * full potential of the LZMA1 or LZMA2 encoder.
  161. */
  162. } lzma_mode;
  163. /**
  164. * \brief Test if given compression mode is supported
  165. *
  166. * It is safe to call this with a value that isn't listed in lzma_mode
  167. * enumeration; the return value will be false.
  168. *
  169. * There is no way to list which modes are available in this particular
  170. * liblzma version and build. It would be useless, because a new compression
  171. * mode, which the application developer wasn't aware, could require giving
  172. * additional options to the encoder that the older modes don't need.
  173. *
  174. * \param mode Mode ID.
  175. *
  176. * \return lzma_bool:
  177. * - true if the compression mode is supported by this liblzma
  178. * build.
  179. * - false otherwise.
  180. */
  181. extern LZMA_API(lzma_bool) lzma_mode_is_supported(lzma_mode mode)
  182. lzma_nothrow lzma_attr_const;
  183. /**
  184. * \brief Options specific to the LZMA1 and LZMA2 filters
  185. *
  186. * Since LZMA1 and LZMA2 share most of the code, it's simplest to share
  187. * the options structure too. For encoding, all but the reserved variables
  188. * need to be initialized unless specifically mentioned otherwise.
  189. * lzma_lzma_preset() can be used to get a good starting point.
  190. *
  191. * For raw decoding, both LZMA1 and LZMA2 need dict_size, preset_dict, and
  192. * preset_dict_size (if preset_dict != NULL). LZMA1 needs also lc, lp, and pb.
  193. */
  194. typedef struct {
  195. /**
  196. * \brief Dictionary size in bytes
  197. *
  198. * Dictionary size indicates how many bytes of the recently processed
  199. * uncompressed data is kept in memory. One method to reduce size of
  200. * the uncompressed data is to store distance-length pairs, which
  201. * indicate what data to repeat from the dictionary buffer. Thus,
  202. * the bigger the dictionary, the better the compression ratio
  203. * usually is.
  204. *
  205. * Maximum size of the dictionary depends on multiple things:
  206. * - Memory usage limit
  207. * - Available address space (not a problem on 64-bit systems)
  208. * - Selected match finder (encoder only)
  209. *
  210. * Currently the maximum dictionary size for encoding is 1.5 GiB
  211. * (i.e. (UINT32_C(1) << 30) + (UINT32_C(1) << 29)) even on 64-bit
  212. * systems for certain match finder implementation reasons. In the
  213. * future, there may be match finders that support bigger
  214. * dictionaries.
  215. *
  216. * Decoder already supports dictionaries up to 4 GiB - 1 B (i.e.
  217. * UINT32_MAX), so increasing the maximum dictionary size of the
  218. * encoder won't cause problems for old decoders.
  219. *
  220. * Because extremely small dictionaries sizes would have unneeded
  221. * overhead in the decoder, the minimum dictionary size is 4096 bytes.
  222. *
  223. * \note When decoding, too big dictionary does no other harm
  224. * than wasting memory.
  225. */
  226. uint32_t dict_size;
  227. # define LZMA_DICT_SIZE_MIN UINT32_C(4096)
  228. # define LZMA_DICT_SIZE_DEFAULT (UINT32_C(1) << 23)
  229. /**
  230. * \brief Pointer to an initial dictionary
  231. *
  232. * It is possible to initialize the LZ77 history window using
  233. * a preset dictionary. It is useful when compressing many
  234. * similar, relatively small chunks of data independently from
  235. * each other. The preset dictionary should contain typical
  236. * strings that occur in the files being compressed. The most
  237. * probable strings should be near the end of the preset dictionary.
  238. *
  239. * This feature should be used only in special situations. For
  240. * now, it works correctly only with raw encoding and decoding.
  241. * Currently none of the container formats supported by
  242. * liblzma allow preset dictionary when decoding, thus if
  243. * you create a .xz or .lzma file with preset dictionary, it
  244. * cannot be decoded with the regular decoder functions. In the
  245. * future, the .xz format will likely get support for preset
  246. * dictionary though.
  247. */
  248. const uint8_t *preset_dict;
  249. /**
  250. * \brief Size of the preset dictionary
  251. *
  252. * Specifies the size of the preset dictionary. If the size is
  253. * bigger than dict_size, only the last dict_size bytes are
  254. * processed.
  255. *
  256. * This variable is read only when preset_dict is not NULL.
  257. * If preset_dict is not NULL but preset_dict_size is zero,
  258. * no preset dictionary is used (identical to only setting
  259. * preset_dict to NULL).
  260. */
  261. uint32_t preset_dict_size;
  262. /**
  263. * \brief Number of literal context bits
  264. *
  265. * How many of the highest bits of the previous uncompressed
  266. * eight-bit byte (also known as 'literal') are taken into
  267. * account when predicting the bits of the next literal.
  268. *
  269. * E.g. in typical English text, an upper-case letter is
  270. * often followed by a lower-case letter, and a lower-case
  271. * letter is usually followed by another lower-case letter.
  272. * In the US-ASCII character set, the highest three bits are 010
  273. * for upper-case letters and 011 for lower-case letters.
  274. * When lc is at least 3, the literal coding can take advantage of
  275. * this property in the uncompressed data.
  276. *
  277. * There is a limit that applies to literal context bits and literal
  278. * position bits together: lc + lp <= 4. Without this limit the
  279. * decoding could become very slow, which could have security related
  280. * results in some cases like email servers doing virus scanning.
  281. * This limit also simplifies the internal implementation in liblzma.
  282. *
  283. * There may be LZMA1 streams that have lc + lp > 4 (maximum possible
  284. * lc would be 8). It is not possible to decode such streams with
  285. * liblzma.
  286. */
  287. uint32_t lc;
  288. # define LZMA_LCLP_MIN 0
  289. # define LZMA_LCLP_MAX 4
  290. # define LZMA_LC_DEFAULT 3
  291. /**
  292. * \brief Number of literal position bits
  293. *
  294. * lp affects what kind of alignment in the uncompressed data is
  295. * assumed when encoding literals. A literal is a single 8-bit byte.
  296. * See pb below for more information about alignment.
  297. */
  298. uint32_t lp;
  299. # define LZMA_LP_DEFAULT 0
  300. /**
  301. * \brief Number of position bits
  302. *
  303. * pb affects what kind of alignment in the uncompressed data is
  304. * assumed in general. The default means four-byte alignment
  305. * (2^ pb =2^2=4), which is often a good choice when there's
  306. * no better guess.
  307. *
  308. * When the alignment is known, setting pb accordingly may reduce
  309. * the file size a little. E.g. with text files having one-byte
  310. * alignment (US-ASCII, ISO-8859-*, UTF-8), setting pb=0 can
  311. * improve compression slightly. For UTF-16 text, pb=1 is a good
  312. * choice. If the alignment is an odd number like 3 bytes, pb=0
  313. * might be the best choice.
  314. *
  315. * Even though the assumed alignment can be adjusted with pb and
  316. * lp, LZMA1 and LZMA2 still slightly favor 16-byte alignment.
  317. * It might be worth taking into account when designing file formats
  318. * that are likely to be often compressed with LZMA1 or LZMA2.
  319. */
  320. uint32_t pb;
  321. # define LZMA_PB_MIN 0
  322. # define LZMA_PB_MAX 4
  323. # define LZMA_PB_DEFAULT 2
  324. /** Compression mode */
  325. lzma_mode mode;
  326. /**
  327. * \brief Nice length of a match
  328. *
  329. * This determines how many bytes the encoder compares from the match
  330. * candidates when looking for the best match. Once a match of at
  331. * least nice_len bytes long is found, the encoder stops looking for
  332. * better candidates and encodes the match. (Naturally, if the found
  333. * match is actually longer than nice_len, the actual length is
  334. * encoded; it's not truncated to nice_len.)
  335. *
  336. * Bigger values usually increase the compression ratio and
  337. * compression time. For most files, 32 to 128 is a good value,
  338. * which gives very good compression ratio at good speed.
  339. *
  340. * The exact minimum value depends on the match finder. The maximum
  341. * is 273, which is the maximum length of a match that LZMA1 and
  342. * LZMA2 can encode.
  343. */
  344. uint32_t nice_len;
  345. /** Match finder ID */
  346. lzma_match_finder mf;
  347. /**
  348. * \brief Maximum search depth in the match finder
  349. *
  350. * For every input byte, match finder searches through the hash chain
  351. * or binary tree in a loop, each iteration going one step deeper in
  352. * the chain or tree. The searching stops if
  353. * - a match of at least nice_len bytes long is found;
  354. * - all match candidates from the hash chain or binary tree have
  355. * been checked; or
  356. * - maximum search depth is reached.
  357. *
  358. * Maximum search depth is needed to prevent the match finder from
  359. * wasting too much time in case there are lots of short match
  360. * candidates. On the other hand, stopping the search before all
  361. * candidates have been checked can reduce compression ratio.
  362. *
  363. * Setting depth to zero tells liblzma to use an automatic default
  364. * value, that depends on the selected match finder and nice_len.
  365. * The default is in the range [4, 200] or so (it may vary between
  366. * liblzma versions).
  367. *
  368. * Using a bigger depth value than the default can increase
  369. * compression ratio in some cases. There is no strict maximum value,
  370. * but high values (thousands or millions) should be used with care:
  371. * the encoder could remain fast enough with typical input, but
  372. * malicious input could cause the match finder to slow down
  373. * dramatically, possibly creating a denial of service attack.
  374. */
  375. uint32_t depth;
  376. /**
  377. * \brief For LZMA_FILTER_LZMA1EXT: Extended flags
  378. *
  379. * This is used only with LZMA_FILTER_LZMA1EXT.
  380. *
  381. * Currently only one flag is supported, LZMA_LZMA1EXT_ALLOW_EOPM:
  382. *
  383. * - Encoder: If the flag is set, then end marker is written just
  384. * like it is with LZMA_FILTER_LZMA1. Without this flag the
  385. * end marker isn't written and the application has to store
  386. * the uncompressed size somewhere outside the compressed stream.
  387. * To decompress streams without the end marker, the application
  388. * has to set the correct uncompressed size in ext_size_low and
  389. * ext_size_high.
  390. *
  391. * - Decoder: If the uncompressed size in ext_size_low and
  392. * ext_size_high is set to the special value UINT64_MAX
  393. * (indicating unknown uncompressed size) then this flag is
  394. * ignored and the end marker must always be present, that is,
  395. * the behavior is identical to LZMA_FILTER_LZMA1.
  396. *
  397. * Otherwise, if this flag isn't set, then the input stream
  398. * must not have the end marker; if the end marker is detected
  399. * then it will result in LZMA_DATA_ERROR. This is useful when
  400. * it is known that the stream must not have the end marker and
  401. * strict validation is wanted.
  402. *
  403. * If this flag is set, then it is autodetected if the end marker
  404. * is present after the specified number of uncompressed bytes
  405. * has been decompressed (ext_size_low and ext_size_high). The
  406. * end marker isn't allowed in any other position. This behavior
  407. * is useful when uncompressed size is known but the end marker
  408. * may or may not be present. This is the case, for example,
  409. * in .7z files (valid .7z files that have the end marker in
  410. * LZMA1 streams are rare but they do exist).
  411. */
  412. uint32_t ext_flags;
  413. # define LZMA_LZMA1EXT_ALLOW_EOPM UINT32_C(0x01)
  414. /**
  415. * \brief For LZMA_FILTER_LZMA1EXT: Uncompressed size (low bits)
  416. *
  417. * The 64-bit uncompressed size is needed for decompression with
  418. * LZMA_FILTER_LZMA1EXT. The size is ignored by the encoder.
  419. *
  420. * The special value UINT64_MAX indicates that the uncompressed size
  421. * is unknown and that the end of payload marker (also known as
  422. * end of stream marker) must be present to indicate the end of
  423. * the LZMA1 stream. Any other value indicates the expected
  424. * uncompressed size of the LZMA1 stream. (If LZMA1 was used together
  425. * with filters that change the size of the data then the uncompressed
  426. * size of the LZMA1 stream could be different than the final
  427. * uncompressed size of the filtered stream.)
  428. *
  429. * ext_size_low holds the least significant 32 bits of the
  430. * uncompressed size. The most significant 32 bits must be set
  431. * in ext_size_high. The macro lzma_ext_size_set(opt_lzma, u64size)
  432. * can be used to set these members.
  433. *
  434. * The 64-bit uncompressed size is split into two uint32_t variables
  435. * because there were no reserved uint64_t members and using the
  436. * same options structure for LZMA_FILTER_LZMA1, LZMA_FILTER_LZMA1EXT,
  437. * and LZMA_FILTER_LZMA2 was otherwise more convenient than having
  438. * a new options structure for LZMA_FILTER_LZMA1EXT. (Replacing two
  439. * uint32_t members with one uint64_t changes the ABI on some systems
  440. * as the alignment of this struct can increase from 4 bytes to 8.)
  441. */
  442. uint32_t ext_size_low;
  443. /**
  444. * \brief For LZMA_FILTER_LZMA1EXT: Uncompressed size (high bits)
  445. *
  446. * This holds the most significant 32 bits of the uncompressed size.
  447. */
  448. uint32_t ext_size_high;
  449. /*
  450. * Reserved space to allow possible future extensions without
  451. * breaking the ABI. You should not touch these, because the names
  452. * of these variables may change. These are and will never be used
  453. * with the currently supported options, so it is safe to leave these
  454. * uninitialized.
  455. */
  456. /** \private Reserved member. */
  457. uint32_t reserved_int4;
  458. /** \private Reserved member. */
  459. uint32_t reserved_int5;
  460. /** \private Reserved member. */
  461. uint32_t reserved_int6;
  462. /** \private Reserved member. */
  463. uint32_t reserved_int7;
  464. /** \private Reserved member. */
  465. uint32_t reserved_int8;
  466. /** \private Reserved member. */
  467. lzma_reserved_enum reserved_enum1;
  468. /** \private Reserved member. */
  469. lzma_reserved_enum reserved_enum2;
  470. /** \private Reserved member. */
  471. lzma_reserved_enum reserved_enum3;
  472. /** \private Reserved member. */
  473. lzma_reserved_enum reserved_enum4;
  474. /** \private Reserved member. */
  475. void *reserved_ptr1;
  476. /** \private Reserved member. */
  477. void *reserved_ptr2;
  478. } lzma_options_lzma;
  479. /**
  480. * \brief Macro to set the 64-bit uncompressed size in ext_size_*
  481. *
  482. * This might be convenient when decoding using LZMA_FILTER_LZMA1EXT.
  483. * This isn't used with LZMA_FILTER_LZMA1 or LZMA_FILTER_LZMA2.
  484. */
  485. #define lzma_set_ext_size(opt_lzma2, u64size) \
  486. do { \
  487. (opt_lzma2).ext_size_low = (uint32_t)(u64size); \
  488. (opt_lzma2).ext_size_high = (uint32_t)((uint64_t)(u64size) >> 32); \
  489. } while (0)
  490. /**
  491. * \brief Set a compression preset to lzma_options_lzma structure
  492. *
  493. * 0 is the fastest and 9 is the slowest. These match the switches -0 .. -9
  494. * of the xz command line tool. In addition, it is possible to bitwise-or
  495. * flags to the preset. Currently only LZMA_PRESET_EXTREME is supported.
  496. * The flags are defined in container.h, because the flags are used also
  497. * with lzma_easy_encoder().
  498. *
  499. * The preset levels are subject to changes between liblzma versions.
  500. *
  501. * This function is available only if LZMA1 or LZMA2 encoder has been enabled
  502. * when building liblzma.
  503. *
  504. * If features (like certain match finders) have been disabled at build time,
  505. * then the function may return success (false) even though the resulting
  506. * LZMA1/LZMA2 options may not be usable for encoder initialization
  507. * (LZMA_OPTIONS_ERROR).
  508. *
  509. * \param[out] options Pointer to LZMA1 or LZMA2 options to be filled
  510. * \param preset Preset level bitwse-ORed with preset flags
  511. *
  512. * \return lzma_bool:
  513. * - true if the preset is not supported (failure).
  514. * - false otherwise (success).
  515. */
  516. extern LZMA_API(lzma_bool) lzma_lzma_preset(
  517. lzma_options_lzma *options, uint32_t preset) lzma_nothrow;