/*
 * Copyright (c) Yann Collet, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

/* *****************************************************************************
 * Constructs a dictionary using a heuristic based on the following paper:
 *
 * Liao, Petri, Moffat, Wirth
 * Effective Construction of Relative Lempel-Ziv Dictionaries
 * Published in WWW 2016.
 *
 * Adapted from code originally written by @ot (Giuseppe Ottaviano).
 ******************************************************************************/

/*-*************************************
* Dependencies
***************************************/
#include <stdio.h>  /* fprintf */
#include <stdlib.h> /* malloc, free, qsort */
#include <string.h> /* memset */
#include <time.h>   /* clock */

#ifndef ZDICT_STATIC_LINKING_ONLY
#  define ZDICT_STATIC_LINKING_ONLY
#endif

#include "../common/mem.h" /* read */
#include "../common/pool.h"
#include "../common/threading.h"
#include "../common/zstd_internal.h" /* includes zstd.h */
#include "../zdict.h"
#include "cover.h"

/*-*************************************
* Constants
***************************************/
#define COVER_MAX_SAMPLES_SIZE (sizeof(size_t) == 8 ? ((unsigned)-1) : ((unsigned)1 GB))
#define COVER_DEFAULT_SPLITPOINT 1.0

/*-*************************************
* Console display
***************************************/
#ifndef LOCALDISPLAYLEVEL
static int g_displayLevel = 2;
#endif
#undef  DISPLAY
#define DISPLAY(...)                                                           \
  {                                                                            \
    fprintf(stderr, __VA_ARGS__);                                              \
    fflush(stderr);                                                            \
  }
#undef  LOCALDISPLAYLEVEL
#define LOCALDISPLAYLEVEL(displayLevel, l, ...)                                \
  if (displayLevel >= l) {                                                     \
    DISPLAY(__VA_ARGS__);                                                      \
  } /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
#undef  DISPLAYLEVEL
#define DISPLAYLEVEL(l, ...) LOCALDISPLAYLEVEL(g_displayLevel, l, __VA_ARGS__)

#ifndef LOCALDISPLAYUPDATE
static const clock_t g_refreshRate = CLOCKS_PER_SEC * 15 / 100;
static clock_t g_time = 0;
#endif
#undef  LOCALDISPLAYUPDATE
#define LOCALDISPLAYUPDATE(displayLevel, l, ...)                               \
  if (displayLevel >= l) {                                                     \
    if ((clock() - g_time > g_refreshRate) || (displayLevel >= 4)) {           \
      g_time = clock();                                                        \
      DISPLAY(__VA_ARGS__);                                                    \
    }                                                                          \
  }
#undef  DISPLAYUPDATE
#define DISPLAYUPDATE(l, ...) LOCALDISPLAYUPDATE(g_displayLevel, l, __VA_ARGS__)

/*-*************************************
* Hash table
***************************************
* A small specialized hash map for storing activeDmers.
* The map does not resize, so if it becomes full it will loop forever.
* Thus, the map must be large enough to store every value.
* The map implements linear probing and keeps its load less than 0.5.
*/

#define MAP_EMPTY_VALUE ((U32)-1)
typedef struct COVER_map_pair_t_s {
  U32 key;
  U32 value;
} COVER_map_pair_t;

typedef struct COVER_map_s {
  COVER_map_pair_t *data;
  U32 sizeLog;
  U32 size;
  U32 sizeMask;
} COVER_map_t;

/**
 * Clear the map.
 */
static void COVER_map_clear(COVER_map_t *map) {
  memset(map->data, MAP_EMPTY_VALUE, map->size * sizeof(COVER_map_pair_t));
}

/**
 * Initializes a map of the given size.
 * Returns 1 on success and 0 on failure.
 * The map must be destroyed with COVER_map_destroy().
 * The map is only guaranteed to be large enough to hold size elements.
 */
static int COVER_map_init(COVER_map_t *map, U32 size) {
  map->sizeLog = ZSTD_highbit32(size) + 2;
  map->size = (U32)1 << map->sizeLog;
  map->sizeMask = map->size - 1;
  map->data = (COVER_map_pair_t *)malloc(map->size * sizeof(COVER_map_pair_t));
  if (!map->data) {
    map->sizeLog = 0;
    map->size = 0;
    return 0;
  }
  COVER_map_clear(map);
  return 1;
}

/**
 * Internal hash function
 */
static const U32 COVER_prime4bytes = 2654435761U;
static U32 COVER_map_hash(COVER_map_t *map, U32 key) {
  return (key * COVER_prime4bytes) >> (32 - map->sizeLog);
}

/**
 * Helper function that returns the index that a key should be placed into.
 */
static U32 COVER_map_index(COVER_map_t *map, U32 key) {
  const U32 hash = COVER_map_hash(map, key);
  U32 i;
  for (i = hash;; i = (i + 1) & map->sizeMask) {
    COVER_map_pair_t *pos = &map->data[i];
    if (pos->value == MAP_EMPTY_VALUE) {
      return i;
    }
    if (pos->key == key) {
      return i;
    }
  }
}

/**
 * Returns the pointer to the value for key.
 * If key is not in the map, it is inserted and the value is set to 0.
 * The map must not be full.
 */
static U32 *COVER_map_at(COVER_map_t *map, U32 key) {
  COVER_map_pair_t *pos = &map->data[COVER_map_index(map, key)];
  if (pos->value == MAP_EMPTY_VALUE) {
    pos->key = key;
    pos->value = 0;
  }
  return &pos->value;
}

/**
 * Deletes key from the map if present.
 */
static void COVER_map_remove(COVER_map_t *map, U32 key) {
  U32 i = COVER_map_index(map, key);
  COVER_map_pair_t *del = &map->data[i];
  U32 shift = 1;
  if (del->value == MAP_EMPTY_VALUE) {
    return;
  }
  for (i = (i + 1) & map->sizeMask;; i = (i + 1) & map->sizeMask) {
    COVER_map_pair_t *const pos = &map->data[i];
    /* If the position is empty we are done */
    if (pos->value == MAP_EMPTY_VALUE) {
      del->value = MAP_EMPTY_VALUE;
      return;
    }
    /* If pos can be moved to del do so */
    if (((i - COVER_map_hash(map, pos->key)) & map->sizeMask) >= shift) {
      del->key = pos->key;
      del->value = pos->value;
      del = pos;
      shift = 1;
    } else {
      ++shift;
    }
  }
}

/**
 * Destroys a map that is initialized with COVER_map_init().
 */
static void COVER_map_destroy(COVER_map_t *map) {
  if (map->data) {
    free(map->data);
  }
  map->data = NULL;
  map->size = 0;
}

/*-*************************************
* Context
***************************************/
typedef struct {
  const BYTE *samples;
  size_t *offsets;
  const size_t *samplesSizes;
  size_t nbSamples;
  size_t nbTrainSamples;
  size_t nbTestSamples;
  U32 *suffix;
  size_t suffixSize;
  U32 *freqs;
  U32 *dmerAt;
  unsigned d;
} COVER_ctx_t;

/* We need a global context for qsort... */
static COVER_ctx_t *g_coverCtx = NULL;

/*-*************************************
* Helper functions
***************************************/

/**
 * Returns the sum of the sample sizes.
 */
size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) {
  size_t sum = 0;
  unsigned i;
  for (i = 0; i < nbSamples; ++i) {
    sum += samplesSizes[i];
  }
  return sum;
}

/**
 * Returns -1 if the dmer at lp is less than the dmer at rp.
 * Returns 0 if the dmers at lp and rp are equal.
 * Returns 1 if the dmer at lp is greater than the dmer at rp.
 */
static int COVER_cmp(COVER_ctx_t *ctx, const void *lp, const void *rp) {
  U32 const lhs = *(U32 const *)lp;
  U32 const rhs = *(U32 const *)rp;
  return memcmp(ctx->samples + lhs, ctx->samples + rhs, ctx->d);
}

/**
 * Faster version for d <= 8.
 */
static int COVER_cmp8(COVER_ctx_t *ctx, const void *lp, const void *rp) {
  U64 const mask = (ctx->d == 8) ? (U64)-1 : (((U64)1 << (8 * ctx->d)) - 1);
  U64 const lhs = MEM_readLE64(ctx->samples + *(U32 const *)lp) & mask;
  U64 const rhs = MEM_readLE64(ctx->samples + *(U32 const *)rp) & mask;
  if (lhs < rhs) {
    return -1;
  }
  return (lhs > rhs);
}

/**
 * Same as COVER_cmp() except ties are broken by pointer value.
 * NOTE: g_coverCtx must be set to call this function. A global is required
 * because qsort doesn't take an opaque pointer.
 */
static int WIN_CDECL COVER_strict_cmp(const void *lp, const void *rp) {
  int result = COVER_cmp(g_coverCtx, lp, rp);
  if (result == 0) {
    result = lp < rp ? -1 : 1;
  }
  return result;
}

/**
 * Faster version for d <= 8.
 */
static int WIN_CDECL COVER_strict_cmp8(const void *lp, const void *rp) {
  int result = COVER_cmp8(g_coverCtx, lp, rp);
  if (result == 0) {
    result = lp < rp ? -1 : 1;
  }
  return result;
}

/**
 * Returns the first pointer in [first, last) whose element does not compare
 * less than value. If no such element exists it returns last.
 */
static const size_t *COVER_lower_bound(const size_t *first, const size_t *last,
                                       size_t value) {
  size_t count = last - first;
  while (count != 0) {
    size_t step = count / 2;
    const size_t *ptr = first;
    ptr += step;
    if (*ptr < value) {
      first = ++ptr;
      count -= step + 1;
    } else {
      count = step;
    }
  }
  return first;
}
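
/* For example (illustrative): with offsets = {10, 20, 30} and value = 15,
 * COVER_lower_bound() returns a pointer to 20, the first element that is not
 * less than 15. With value = 35 every element compares less, so it returns
 * last.
 */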

/**
 * Generic groupBy function.
 * Groups an array sorted by cmp into groups with equivalent values.
 * Calls grp for each group.
 */
static void
COVER_groupBy(const void *data, size_t count, size_t size, COVER_ctx_t *ctx,
              int (*cmp)(COVER_ctx_t *, const void *, const void *),
              void (*grp)(COVER_ctx_t *, const void *, const void *)) {
  const BYTE *ptr = (const BYTE *)data;
  size_t num = 0;
  while (num < count) {
    const BYTE *grpEnd = ptr + size;
    ++num;
    while (num < count && cmp(ctx, ptr, grpEnd) == 0) {
      grpEnd += size;
      ++num;
    }
    grp(ctx, ptr, grpEnd);
    ptr = grpEnd;
  }
}
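
/* For example (illustrative): over the sorted array {1, 1, 2, 3, 3} with
 * size == sizeof(int), grp is called three times, on the half-open ranges
 * covering {1, 1}, {2}, and {3, 3} respectively.
 */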

/*-*************************************
* Cover functions
***************************************/

/**
 * Called on each group of positions with the same dmer.
 * Counts the frequency of each dmer and saves it in the suffix array.
 * Fills `ctx->dmerAt`.
 */
static void COVER_group(COVER_ctx_t *ctx, const void *group,
                        const void *groupEnd) {
  /* The group consists of all the positions with the same first d bytes. */
  const U32 *grpPtr = (const U32 *)group;
  const U32 *grpEnd = (const U32 *)groupEnd;
  /* The dmerId is how we will reference this dmer.
   * This allows us to map the whole dmer space to a much smaller space, the
   * size of the suffix array.
   */
  const U32 dmerId = (U32)(grpPtr - ctx->suffix);
  /* Count the number of samples this dmer shows up in */
  U32 freq = 0;
  /* Details */
  const size_t *curOffsetPtr = ctx->offsets;
  const size_t *offsetsEnd = ctx->offsets + ctx->nbSamples;
  /* Once *grpPtr >= curSampleEnd this occurrence of the dmer is in a
   * different sample than the last.
   */
  size_t curSampleEnd = ctx->offsets[0];
  for (; grpPtr != grpEnd; ++grpPtr) {
    /* Save the dmerId for this position so we can get back to it. */
    ctx->dmerAt[*grpPtr] = dmerId;
    /* Dictionaries only help for the first reference to the dmer.
     * After that zstd can reference the match from the previous reference.
     * So only count each dmer once for each sample it is in.
     */
    if (*grpPtr < curSampleEnd) {
      continue;
    }
    freq += 1;
    /* Binary search to find the end of the sample *grpPtr is in.
     * In the common case that grpPtr + 1 == grpEnd we can skip the binary
     * search because the loop is over.
     */
    if (grpPtr + 1 != grpEnd) {
      const size_t *sampleEndPtr =
          COVER_lower_bound(curOffsetPtr, offsetsEnd, *grpPtr);
      curSampleEnd = *sampleEndPtr;
      curOffsetPtr = sampleEndPtr + 1;
    }
  }
  /* At this point we are never going to look at this segment of the suffix
   * array again. We take advantage of this fact to save memory.
   * We store the frequency of the dmer in the first position of the group,
   * which is dmerId.
   */
  ctx->suffix[dmerId] = freq;
}

/**
 * Selects the best segment in an epoch.
 * Segments are scored according to the function:
 *
 * Let F(d) be the frequency of dmer d.
 * Let S_i be the dmer at position i of segment S which has length k.
 *
 * Score(S) = F(S_1) + F(S_2) + ... + F(S_{k-d+1})
 *
 * Once the dmer d is in the dictionary we set F(d) = 0.
 */
static COVER_segment_t COVER_selectSegment(const COVER_ctx_t *ctx, U32 *freqs,
                                           COVER_map_t *activeDmers, U32 begin,
                                           U32 end,
                                           ZDICT_cover_params_t parameters) {
  /* Constants */
  const U32 k = parameters.k;
  const U32 d = parameters.d;
  const U32 dmersInK = k - d + 1;
  /* Try each segment (activeSegment) and save the best (bestSegment) */
  COVER_segment_t bestSegment = {0, 0, 0};
  COVER_segment_t activeSegment;
  /* Reset the activeDmers in the segment */
  COVER_map_clear(activeDmers);
  /* The activeSegment starts at the beginning of the epoch. */
  activeSegment.begin = begin;
  activeSegment.end = begin;
  activeSegment.score = 0;
  /* Slide the activeSegment through the whole epoch.
   * Save the best segment in bestSegment.
   */
  while (activeSegment.end < end) {
    /* The dmerId for the dmer at the next position */
    U32 newDmer = ctx->dmerAt[activeSegment.end];
    /* The entry in activeDmers for this dmerId */
    U32 *newDmerOcc = COVER_map_at(activeDmers, newDmer);
    /* If the dmer isn't already present in the segment add its score. */
    if (*newDmerOcc == 0) {
      /* The paper suggests using the L-0.5 norm, but experiments show that it
       * doesn't help.
       */
      activeSegment.score += freqs[newDmer];
    }
    /* Add the dmer to the segment */
    activeSegment.end += 1;
    *newDmerOcc += 1;
    /* If the window is now too large, drop the first position */
    if (activeSegment.end - activeSegment.begin == dmersInK + 1) {
      U32 delDmer = ctx->dmerAt[activeSegment.begin];
      U32 *delDmerOcc = COVER_map_at(activeDmers, delDmer);
      activeSegment.begin += 1;
      *delDmerOcc -= 1;
      /* If this is the last occurrence of the dmer, subtract its score */
      if (*delDmerOcc == 0) {
        COVER_map_remove(activeDmers, delDmer);
        activeSegment.score -= freqs[delDmer];
      }
    }
    /* If this segment is the best so far save it */
    if (activeSegment.score > bestSegment.score) {
      bestSegment = activeSegment;
    }
  }
  {
    /* Trim off the zero frequency head and tail from the segment. */
    U32 newBegin = bestSegment.end;
    U32 newEnd = bestSegment.begin;
    U32 pos;
    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
      U32 freq = freqs[ctx->dmerAt[pos]];
      if (freq != 0) {
        newBegin = MIN(newBegin, pos);
        newEnd = pos + 1;
      }
    }
    bestSegment.begin = newBegin;
    bestSegment.end = newEnd;
  }
  {
    /* Zero out the frequency of each dmer covered by the chosen segment. */
    U32 pos;
    for (pos = bestSegment.begin; pos != bestSegment.end; ++pos) {
      freqs[ctx->dmerAt[pos]] = 0;
    }
  }
  return bestSegment;
}
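
/* Worked example (illustrative): with k = 16 and d = 8, each segment holds
 * dmersInK = 9 dmers. A dmer that occurs twice inside the window contributes
 * its frequency to the score only once (the activeDmers map tracks occurrence
 * counts), and its frequency is subtracted only when its last occurrence
 * leaves the window.
 */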

/**
 * Check the validity of the parameters.
 * Returns non-zero if the parameters are valid and 0 otherwise.
 */
static int COVER_checkParameters(ZDICT_cover_params_t parameters,
                                 size_t maxDictSize) {
  /* k and d are required parameters */
  if (parameters.d == 0 || parameters.k == 0) {
    return 0;
  }
  /* k <= maxDictSize */
  if (parameters.k > maxDictSize) {
    return 0;
  }
  /* d <= k */
  if (parameters.d > parameters.k) {
    return 0;
  }
  /* 0 < splitPoint <= 1 */
  if (parameters.splitPoint <= 0 || parameters.splitPoint > 1) {
    return 0;
  }
  return 1;
}

/**
 * Clean up a context initialized with `COVER_ctx_init()`.
 */
static void COVER_ctx_destroy(COVER_ctx_t *ctx) {
  if (!ctx) {
    return;
  }
  if (ctx->suffix) {
    free(ctx->suffix);
    ctx->suffix = NULL;
  }
  if (ctx->freqs) {
    free(ctx->freqs);
    ctx->freqs = NULL;
  }
  if (ctx->dmerAt) {
    free(ctx->dmerAt);
    ctx->dmerAt = NULL;
  }
  if (ctx->offsets) {
    free(ctx->offsets);
    ctx->offsets = NULL;
  }
}

/**
 * Prepare a context for dictionary building.
 * The context is only dependent on the parameter `d` and can be used multiple
 * times.
 * Returns 0 on success or error code on error.
 * The context must be destroyed with `COVER_ctx_destroy()`.
 */
static size_t COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
                             const size_t *samplesSizes, unsigned nbSamples,
                             unsigned d, double splitPoint) {
  const BYTE *const samples = (const BYTE *)samplesBuffer;
  const size_t totalSamplesSize = COVER_sum(samplesSizes, nbSamples);
  /* Split samples into testing and training sets */
  const unsigned nbTrainSamples = splitPoint < 1.0 ? (unsigned)((double)nbSamples * splitPoint) : nbSamples;
  const unsigned nbTestSamples = splitPoint < 1.0 ? nbSamples - nbTrainSamples : nbSamples;
  const size_t trainingSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes, nbTrainSamples) : totalSamplesSize;
  const size_t testSamplesSize = splitPoint < 1.0 ? COVER_sum(samplesSizes + nbTrainSamples, nbTestSamples) : totalSamplesSize;
  /* Checks */
  if (totalSamplesSize < MAX(d, sizeof(U64)) ||
      totalSamplesSize >= (size_t)COVER_MAX_SAMPLES_SIZE) {
    DISPLAYLEVEL(1, "Total samples size is too large (%u MB), maximum size is %u MB\n",
                 (unsigned)(totalSamplesSize >> 20), (COVER_MAX_SAMPLES_SIZE >> 20));
    return ERROR(srcSize_wrong);
  }
  /* Check if there are at least 5 training samples */
  if (nbTrainSamples < 5) {
    DISPLAYLEVEL(1, "Total number of training samples is %u and is invalid.", nbTrainSamples);
    return ERROR(srcSize_wrong);
  }
  /* Check if there's a testing sample */
  if (nbTestSamples < 1) {
    DISPLAYLEVEL(1, "Total number of testing samples is %u and is invalid.", nbTestSamples);
    return ERROR(srcSize_wrong);
  }
  /* Zero the context */
  memset(ctx, 0, sizeof(*ctx));
  DISPLAYLEVEL(2, "Training on %u samples of total size %u\n", nbTrainSamples,
               (unsigned)trainingSamplesSize);
  DISPLAYLEVEL(2, "Testing on %u samples of total size %u\n", nbTestSamples,
               (unsigned)testSamplesSize);
  ctx->samples = samples;
  ctx->samplesSizes = samplesSizes;
  ctx->nbSamples = nbSamples;
  ctx->nbTrainSamples = nbTrainSamples;
  ctx->nbTestSamples = nbTestSamples;
  /* Partial suffix array */
  ctx->suffixSize = trainingSamplesSize - MAX(d, sizeof(U64)) + 1;
  ctx->suffix = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  /* Maps index to the dmerID */
  ctx->dmerAt = (U32 *)malloc(ctx->suffixSize * sizeof(U32));
  /* The offsets of each file */
  ctx->offsets = (size_t *)malloc((nbSamples + 1) * sizeof(size_t));
  if (!ctx->suffix || !ctx->dmerAt || !ctx->offsets) {
    DISPLAYLEVEL(1, "Failed to allocate scratch buffers\n");
    COVER_ctx_destroy(ctx);
    return ERROR(memory_allocation);
  }
  ctx->freqs = NULL;
  ctx->d = d;
  /* Fill offsets from the samplesSizes */
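  /* For example (illustrative): samplesSizes = {100, 200, 50} yields
   * offsets = {0, 100, 300, 350}; offsets[i] is where sample i begins and
   * offsets[nbSamples] is the total size.
   */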
  {
    U32 i;
    ctx->offsets[0] = 0;
    for (i = 1; i <= nbSamples; ++i) {
      ctx->offsets[i] = ctx->offsets[i - 1] + samplesSizes[i - 1];
    }
  }
  DISPLAYLEVEL(2, "Constructing partial suffix array\n");
  {
    /* suffix is a partial suffix array.
     * It only sorts suffixes by their first parameters.d bytes.
     * The sort is stable, so each dmer group is sorted by position in input.
     */
    U32 i;
    for (i = 0; i < ctx->suffixSize; ++i) {
      ctx->suffix[i] = i;
    }
    /* qsort doesn't take an opaque pointer, so pass as a global.
     * On OpenBSD qsort() is not guaranteed to be stable, their mergesort() is.
     */
    g_coverCtx = ctx;
#if defined(__OpenBSD__)
    mergesort(ctx->suffix, ctx->suffixSize, sizeof(U32),
              (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
#else
    qsort(ctx->suffix, ctx->suffixSize, sizeof(U32),
          (ctx->d <= 8 ? &COVER_strict_cmp8 : &COVER_strict_cmp));
#endif
  }
  DISPLAYLEVEL(2, "Computing frequencies\n");
  /* For each dmer group (group of positions with the same first d bytes):
   * 1. For each position we set dmerAt[position] = dmerID. The dmerID is
   *    (groupBeginPtr - suffix). This allows us to go from position to
   *    dmerID so we can look up values in freq.
   * 2. We calculate how many samples the dmer occurs in and save it in
   *    freqs[dmerId].
   */
  COVER_groupBy(ctx->suffix, ctx->suffixSize, sizeof(U32), ctx,
                (ctx->d <= 8 ? &COVER_cmp8 : &COVER_cmp), &COVER_group);
  ctx->freqs = ctx->suffix;
  ctx->suffix = NULL;
  return 0;
}

void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel)
{
  const double ratio = (double)nbDmers / maxDictSize;
  if (ratio >= 10) {
    return;
  }
  LOCALDISPLAYLEVEL(displayLevel, 1,
                    "WARNING: The maximum dictionary size %u is too large "
                    "compared to the source size %u! "
                    "size(source)/size(dictionary) = %f, but it should be >= "
                    "10! This may lead to a subpar dictionary! We recommend "
                    "training on sources at least 10x, and preferably 100x "
                    "the size of the dictionary! \n", (U32)maxDictSize,
                    (U32)nbDmers, ratio);
}

COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
                                       U32 nbDmers, U32 k, U32 passes)
{
  const U32 minEpochSize = k * 10;
  COVER_epoch_info_t epochs;
  epochs.num = MAX(1, maxDictSize / k / passes);
  epochs.size = nbDmers / epochs.num;
  if (epochs.size >= minEpochSize) {
    assert(epochs.size * epochs.num <= nbDmers);
    return epochs;
  }
  epochs.size = MIN(minEpochSize, nbDmers);
  epochs.num = nbDmers / epochs.size;
  assert(epochs.size * epochs.num <= nbDmers);
  return epochs;
}
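
/* Worked example (illustrative): maxDictSize = 112640 (110 KB), k = 1024 and
 * passes = 4 give epochs.num = MAX(1, 112640 / 1024 / 4) = 27. With
 * nbDmers = 1000000, epochs.size = 1000000 / 27 = 37037, which is at least
 * minEpochSize = 10240, so those values are returned as-is.
 */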

/**
 * Given the prepared context build the dictionary.
 */
static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
                                    COVER_map_t *activeDmers, void *dictBuffer,
                                    size_t dictBufferCapacity,
                                    ZDICT_cover_params_t parameters) {
  BYTE *const dict = (BYTE *)dictBuffer;
  size_t tail = dictBufferCapacity;
  /* Divide the data into epochs. We will select one segment from each epoch. */
  const COVER_epoch_info_t epochs = COVER_computeEpochs(
      (U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
  const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
  size_t zeroScoreRun = 0;
  size_t epoch;
  DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
               (U32)epochs.num, (U32)epochs.size);
  /* Loop through the epochs until there are no more segments or the dictionary
   * is full.
   */
  for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
    const U32 epochBegin = (U32)(epoch * epochs.size);
    const U32 epochEnd = epochBegin + epochs.size;
    size_t segmentSize;
    /* Select a segment */
    COVER_segment_t segment = COVER_selectSegment(
        ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
    /* If the segment covers no dmers, then we are out of content.
     * There may be new content in other epochs, so continue for some time.
     */
    if (segment.score == 0) {
      if (++zeroScoreRun >= maxZeroScoreRun) {
        break;
      }
      continue;
    }
    zeroScoreRun = 0;
    /* Trim the segment if necessary and if it is too small then we are done */
    segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
    if (segmentSize < parameters.d) {
      break;
    }
    /* We fill the dictionary from the back to allow the best segments to be
     * referenced with the smallest offsets.
     */
    tail -= segmentSize;
    memcpy(dict + tail, ctx->samples + segment.begin, segmentSize);
    DISPLAYUPDATE(
        2, "\r%u%% ",
        (unsigned)(((dictBufferCapacity - tail) * 100) / dictBufferCapacity));
  }
  DISPLAYLEVEL(2, "\r%79s\r", "");
  return tail;
}

ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
    void *dictBuffer, size_t dictBufferCapacity,
    const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
    ZDICT_cover_params_t parameters)
{
  BYTE* const dict = (BYTE*)dictBuffer;
  COVER_ctx_t ctx;
  COVER_map_t activeDmers;
  parameters.splitPoint = 1.0;
  /* Initialize global data */
  g_displayLevel = parameters.zParams.notificationLevel;
  /* Checks */
  if (!COVER_checkParameters(parameters, dictBufferCapacity)) {
    DISPLAYLEVEL(1, "Cover parameters incorrect\n");
    return ERROR(parameter_outOfBound);
  }
  if (nbSamples == 0) {
    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
    return ERROR(srcSize_wrong);
  }
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
    DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
                 ZDICT_DICTSIZE_MIN);
    return ERROR(dstSize_tooSmall);
  }
  /* Initialize context and activeDmers */
  {
    size_t const initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples,
                                          parameters.d, parameters.splitPoint);
    if (ZSTD_isError(initVal)) {
      return initVal;
    }
  }
  COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, g_displayLevel);
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
    COVER_ctx_destroy(&ctx);
    return ERROR(memory_allocation);
  }
  DISPLAYLEVEL(2, "Building dictionary\n");
  {
    const size_t tail =
        COVER_buildDictionary(&ctx, ctx.freqs, &activeDmers, dictBuffer,
                              dictBufferCapacity, parameters);
    const size_t dictionarySize = ZDICT_finalizeDictionary(
        dict, dictBufferCapacity, dict + tail, dictBufferCapacity - tail,
        samplesBuffer, samplesSizes, nbSamples, parameters.zParams);
    if (!ZSTD_isError(dictionarySize)) {
      DISPLAYLEVEL(2, "Constructed dictionary of size %u\n",
                   (unsigned)dictionarySize);
    }
    COVER_ctx_destroy(&ctx);
    COVER_map_destroy(&activeDmers);
    return dictionarySize;
  }
}
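
#if 0
/* Illustrative usage sketch of the public entry point above (not compiled;
 * the buffer names, sizes, and parameter choices are assumptions for the
 * example, not library defaults):
 */
{
  char dictBuffer[112640];               /* 110 KB target dictionary */
  ZDICT_cover_params_t params;
  memset(&params, 0, sizeof(params));
  params.k = 1024;                       /* segment size */
  params.d = 8;                          /* dmer size, must be <= k */
  params.zParams.compressionLevel = 3;
  {
    size_t const dictSize = ZDICT_trainFromBuffer_cover(
        dictBuffer, sizeof(dictBuffer),
        samplesBuffer, samplesSizes, nbSamples, params);
    if (ZDICT_isError(dictSize)) { /* handle the error */ }
  }
}
#endif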

size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,
                                      const size_t *samplesSizes, const BYTE *samples,
                                      size_t *offsets,
                                      size_t nbTrainSamples, size_t nbSamples,
                                      BYTE *const dict, size_t dictBufferCapacity) {
  size_t totalCompressedSize = ERROR(GENERIC);
  /* Pointers */
  ZSTD_CCtx *cctx;
  ZSTD_CDict *cdict;
  void *dst;
  /* Local variables */
  size_t dstCapacity;
  size_t i;
  /* Allocate dst with enough space to compress the maximum sized sample */
  {
    size_t maxSampleSize = 0;
    i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
    for (; i < nbSamples; ++i) {
      maxSampleSize = MAX(samplesSizes[i], maxSampleSize);
    }
    dstCapacity = ZSTD_compressBound(maxSampleSize);
    dst = malloc(dstCapacity);
  }
  /* Create the cctx and cdict */
  cctx = ZSTD_createCCtx();
  cdict = ZSTD_createCDict(dict, dictBufferCapacity,
                           parameters.zParams.compressionLevel);
  if (!dst || !cctx || !cdict) {
    goto _compressCleanup;
  }
  /* Compress each sample and sum their sizes (or error) */
  totalCompressedSize = dictBufferCapacity;
  i = parameters.splitPoint < 1.0 ? nbTrainSamples : 0;
  for (; i < nbSamples; ++i) {
    const size_t size = ZSTD_compress_usingCDict(
        cctx, dst, dstCapacity, samples + offsets[i],
        samplesSizes[i], cdict);
    if (ZSTD_isError(size)) {
      totalCompressedSize = size;
      goto _compressCleanup;
    }
    totalCompressedSize += size;
  }
_compressCleanup:
  ZSTD_freeCCtx(cctx);
  ZSTD_freeCDict(cdict);
  if (dst) {
    free(dst);
  }
  return totalCompressedSize;
}

/**
 * Initialize the `COVER_best_t`.
 */
void COVER_best_init(COVER_best_t *best) {
  if (best == NULL) return; /* compatible with init on NULL */
  (void)ZSTD_pthread_mutex_init(&best->mutex, NULL);
  (void)ZSTD_pthread_cond_init(&best->cond, NULL);
  best->liveJobs = 0;
  best->dict = NULL;
  best->dictSize = 0;
  best->compressedSize = (size_t)-1;
  memset(&best->parameters, 0, sizeof(best->parameters));
}

/**
 * Wait until liveJobs == 0.
 */
void COVER_best_wait(COVER_best_t *best) {
  if (!best) {
    return;
  }
  ZSTD_pthread_mutex_lock(&best->mutex);
  while (best->liveJobs != 0) {
    ZSTD_pthread_cond_wait(&best->cond, &best->mutex);
  }
  ZSTD_pthread_mutex_unlock(&best->mutex);
}

/**
 * Call COVER_best_wait() and then destroy the COVER_best_t.
 */
void COVER_best_destroy(COVER_best_t *best) {
  if (!best) {
    return;
  }
  COVER_best_wait(best);
  if (best->dict) {
    free(best->dict);
  }
  ZSTD_pthread_mutex_destroy(&best->mutex);
  ZSTD_pthread_cond_destroy(&best->cond);
}

/**
 * Called when a thread is about to be launched.
 * Increments liveJobs.
 */
void COVER_best_start(COVER_best_t *best) {
  if (!best) {
    return;
  }
  ZSTD_pthread_mutex_lock(&best->mutex);
  ++best->liveJobs;
  ZSTD_pthread_mutex_unlock(&best->mutex);
}

/**
 * Called when a thread finishes executing, whether it ended in error or
 * success.
 * Decrements liveJobs and signals any waiting threads if liveJobs == 0.
 * If this dictionary is the best so far save it and its parameters.
 */
void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,
                       COVER_dictSelection_t selection) {
  void* dict = selection.dictContent;
  size_t compressedSize = selection.totalCompressedSize;
  size_t dictSize = selection.dictSize;
  if (!best) {
    return;
  }
  {
    size_t liveJobs;
    ZSTD_pthread_mutex_lock(&best->mutex);
    --best->liveJobs;
    liveJobs = best->liveJobs;
    /* If the new dictionary is better */
    if (compressedSize < best->compressedSize) {
      /* Allocate space if necessary */
      if (!best->dict || best->dictSize < dictSize) {
        if (best->dict) {
          free(best->dict);
        }
        best->dict = malloc(dictSize);
        if (!best->dict) {
          best->compressedSize = ERROR(GENERIC);
          best->dictSize = 0;
          ZSTD_pthread_cond_signal(&best->cond);
          ZSTD_pthread_mutex_unlock(&best->mutex);
          return;
        }
      }
      /* Save the dictionary, parameters, and size */
      if (dict) {
        memcpy(best->dict, dict, dictSize);
        best->dictSize = dictSize;
        best->parameters = parameters;
        best->compressedSize = compressedSize;
      }
    }
    if (liveJobs == 0) {
      ZSTD_pthread_cond_broadcast(&best->cond);
    }
    ZSTD_pthread_mutex_unlock(&best->mutex);
  }
}

COVER_dictSelection_t COVER_dictSelectionError(size_t error) {
  COVER_dictSelection_t selection = { NULL, 0, error };
  return selection;
}

unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection) {
  return (ZSTD_isError(selection.totalCompressedSize) || !selection.dictContent);
}

void COVER_dictSelectionFree(COVER_dictSelection_t selection) {
  free(selection.dictContent);
}

COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,
    size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,
    size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize) {
  size_t largestDict = 0;
  size_t largestCompressed = 0;
  BYTE* customDictContentEnd = customDictContent + dictContentSize;
  BYTE* largestDictbuffer = (BYTE*)malloc(dictBufferCapacity);
  BYTE* candidateDictBuffer = (BYTE*)malloc(dictBufferCapacity);
  double regressionTolerance = ((double)params.shrinkDictMaxRegression / 100.0) + 1.00;
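  /* For example (illustrative): shrinkDictMaxRegression = 5 gives a tolerance
   * of 1.05, i.e. a smaller candidate dictionary is accepted if its total
   * compressed size is within 5% of the size achieved by the largest
   * dictionary.
   */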
  if (!largestDictbuffer || !candidateDictBuffer) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    return COVER_dictSelectionError(dictContentSize);
  }
  /* Initial dictionary size and compressed size */
  memcpy(largestDictbuffer, customDictContent, dictContentSize);
  dictContentSize = ZDICT_finalizeDictionary(
      largestDictbuffer, dictBufferCapacity, customDictContent, dictContentSize,
      samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
  if (ZDICT_isError(dictContentSize)) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    return COVER_dictSelectionError(dictContentSize);
  }
  totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
                                                       samplesBuffer, offsets,
                                                       nbCheckSamples, nbSamples,
                                                       largestDictbuffer, dictContentSize);
  if (ZSTD_isError(totalCompressedSize)) {
    free(largestDictbuffer);
    free(candidateDictBuffer);
    return COVER_dictSelectionError(totalCompressedSize);
  }
  if (params.shrinkDict == 0) {
    COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
    free(candidateDictBuffer);
    return selection;
  }
  largestDict = dictContentSize;
  largestCompressed = totalCompressedSize;
  dictContentSize = ZDICT_DICTSIZE_MIN;
  /* Largest dict is initially at least ZDICT_DICTSIZE_MIN */
  while (dictContentSize < largestDict) {
    memcpy(candidateDictBuffer, largestDictbuffer, largestDict);
    dictContentSize = ZDICT_finalizeDictionary(
        candidateDictBuffer, dictBufferCapacity, customDictContentEnd - dictContentSize, dictContentSize,
        samplesBuffer, samplesSizes, nbFinalizeSamples, params.zParams);
    if (ZDICT_isError(dictContentSize)) {
      free(largestDictbuffer);
      free(candidateDictBuffer);
      return COVER_dictSelectionError(dictContentSize);
    }
    totalCompressedSize = COVER_checkTotalCompressedSize(params, samplesSizes,
                                                         samplesBuffer, offsets,
                                                         nbCheckSamples, nbSamples,
                                                         candidateDictBuffer, dictContentSize);
    if (ZSTD_isError(totalCompressedSize)) {
      free(largestDictbuffer);
      free(candidateDictBuffer);
      return COVER_dictSelectionError(totalCompressedSize);
    }
    if (totalCompressedSize <= largestCompressed * regressionTolerance) {
      COVER_dictSelection_t selection = { candidateDictBuffer, dictContentSize, totalCompressedSize };
      free(largestDictbuffer);
      return selection;
    }
    dictContentSize *= 2;
  }
  dictContentSize = largestDict;
  totalCompressedSize = largestCompressed;
  {
    COVER_dictSelection_t selection = { largestDictbuffer, dictContentSize, totalCompressedSize };
    free(candidateDictBuffer);
    return selection;
  }
}

/**
 * Parameters for COVER_tryParameters().
 */
typedef struct COVER_tryParameters_data_s {
  const COVER_ctx_t *ctx;
  COVER_best_t *best;
  size_t dictBufferCapacity;
  ZDICT_cover_params_t parameters;
} COVER_tryParameters_data_t;

/**
 * Tries a set of parameters and updates the COVER_best_t with the results.
 * This function is thread safe if zstd is compiled with multithreaded support.
 * It takes its parameters as an *OWNING* opaque pointer to support threading.
 */
static void COVER_tryParameters(void *opaque)
{
  /* Save parameters as local variables */
  COVER_tryParameters_data_t *const data = (COVER_tryParameters_data_t*)opaque;
  const COVER_ctx_t *const ctx = data->ctx;
  const ZDICT_cover_params_t parameters = data->parameters;
  size_t dictBufferCapacity = data->dictBufferCapacity;
  size_t totalCompressedSize = ERROR(GENERIC);
  /* Allocate space for hash table, dict, and freqs */
  COVER_map_t activeDmers;
  BYTE* const dict = (BYTE*)malloc(dictBufferCapacity);
  COVER_dictSelection_t selection = COVER_dictSelectionError(ERROR(GENERIC));
  U32* const freqs = (U32*)malloc(ctx->suffixSize * sizeof(U32));
  if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
    DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
    goto _cleanup;
  }
  if (!dict || !freqs) {
    DISPLAYLEVEL(1, "Failed to allocate buffers: out of memory\n");
    goto _cleanup;
  }
  /* Copy the frequencies because we need to modify them */
  memcpy(freqs, ctx->freqs, ctx->suffixSize * sizeof(U32));
  /* Build the dictionary */
  {
    const size_t tail = COVER_buildDictionary(ctx, freqs, &activeDmers, dict,
                                              dictBufferCapacity, parameters);
    selection = COVER_selectDict(dict + tail, dictBufferCapacity, dictBufferCapacity - tail,
        ctx->samples, ctx->samplesSizes, (unsigned)ctx->nbTrainSamples, ctx->nbTrainSamples, ctx->nbSamples, parameters, ctx->offsets,
        totalCompressedSize);
    if (COVER_dictSelectionIsError(selection)) {
      DISPLAYLEVEL(1, "Failed to select dictionary\n");
      goto _cleanup;
    }
  }
_cleanup:
  free(dict);
  COVER_best_finish(data->best, parameters, selection);
  free(data);
  COVER_map_destroy(&activeDmers);
  COVER_dictSelectionFree(selection);
  free(freqs);
}

ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
    void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer,
    const size_t* samplesSizes, unsigned nbSamples,
    ZDICT_cover_params_t* parameters)
{
  /* constants */
  const unsigned nbThreads = parameters->nbThreads;
  const double splitPoint =
      parameters->splitPoint <= 0.0 ? COVER_DEFAULT_SPLITPOINT : parameters->splitPoint;
  const unsigned kMinD = parameters->d == 0 ? 6 : parameters->d;
  const unsigned kMaxD = parameters->d == 0 ? 8 : parameters->d;
  const unsigned kMinK = parameters->k == 0 ? 50 : parameters->k;
  const unsigned kMaxK = parameters->k == 0 ? 2000 : parameters->k;
  const unsigned kSteps = parameters->steps == 0 ? 40 : parameters->steps;
  const unsigned kStepSize = MAX((kMaxK - kMinK) / kSteps, 1);
  const unsigned kIterations =
      (1 + (kMaxD - kMinD) / 2) * (1 + (kMaxK - kMinK) / kStepSize);
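  /* For example (illustrative): with the defaults above (d, k, and steps all
   * unset), d is tried at {6, 8} and k from 50 to 2000 in steps of
   * kStepSize = MAX(1950 / 40, 1) = 48, so
   * kIterations = (1 + 1) * (1 + 1950 / 48) = 2 * 41 = 82 parameter sets.
   */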
  const unsigned shrinkDict = 0;
  /* Local variables */
  const int displayLevel = parameters->zParams.notificationLevel;
  unsigned iteration = 1;
  unsigned d;
  unsigned k;
  COVER_best_t best;
  POOL_ctx *pool = NULL;
  int warned = 0;
  /* Checks */
  if (splitPoint <= 0 || splitPoint > 1) {
    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
    return ERROR(parameter_outOfBound);
  }
  if (kMinK < kMaxD || kMaxK < kMinK) {
    LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect parameters\n");
    return ERROR(parameter_outOfBound);
  }
  if (nbSamples == 0) {
    DISPLAYLEVEL(1, "Cover must have at least one input file\n");
    return ERROR(srcSize_wrong);
  }
  if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) {
    DISPLAYLEVEL(1, "dictBufferCapacity must be at least %u\n",
                 ZDICT_DICTSIZE_MIN);
    return ERROR(dstSize_tooSmall);
  }
  if (nbThreads > 1) {
    pool = POOL_create(nbThreads, 1);
    if (!pool) {
      return ERROR(memory_allocation);
    }
  }
  /* Initialization */
  COVER_best_init(&best);
  /* Turn down global display level to clean up display at level 2 and below */
  g_displayLevel = displayLevel == 0 ? 0 : displayLevel - 1;
  /* Loop through d first because each new value needs a new context */
  LOCALDISPLAYLEVEL(displayLevel, 2, "Trying %u different sets of parameters\n",
                    kIterations);
  for (d = kMinD; d <= kMaxD; d += 2) {
    /* Initialize the context for this value of d */
    COVER_ctx_t ctx;
    LOCALDISPLAYLEVEL(displayLevel, 3, "d=%u\n", d);
    {
      const size_t initVal = COVER_ctx_init(&ctx, samplesBuffer, samplesSizes, nbSamples, d, splitPoint);
      if (ZSTD_isError(initVal)) {
        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to initialize context\n");
        COVER_best_destroy(&best);
        POOL_free(pool);
        return initVal;
      }
    }
    if (!warned) {
      COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize, displayLevel);
      warned = 1;
    }
    /* Loop through k reusing the same context */
    for (k = kMinK; k <= kMaxK; k += kStepSize) {
      /* Prepare the arguments */
      COVER_tryParameters_data_t *data = (COVER_tryParameters_data_t *)malloc(
          sizeof(COVER_tryParameters_data_t));
      LOCALDISPLAYLEVEL(displayLevel, 3, "k=%u\n", k);
      if (!data) {
        LOCALDISPLAYLEVEL(displayLevel, 1, "Failed to allocate parameters\n");
        COVER_best_destroy(&best);
        COVER_ctx_destroy(&ctx);
        POOL_free(pool);
        return ERROR(memory_allocation);
      }
      data->ctx = &ctx;
      data->best = &best;
      data->dictBufferCapacity = dictBufferCapacity;
      data->parameters = *parameters;
      data->parameters.k = k;
      data->parameters.d = d;
      data->parameters.splitPoint = splitPoint;
      data->parameters.steps = kSteps;
      data->parameters.shrinkDict = shrinkDict;
      data->parameters.zParams.notificationLevel = g_displayLevel;
      /* Check the parameters */
      if (!COVER_checkParameters(data->parameters, dictBufferCapacity)) {
        DISPLAYLEVEL(1, "Cover parameters incorrect\n");
        free(data);
        continue;
      }
      /* Call the function and pass ownership of data to it */
      COVER_best_start(&best);
      if (pool) {
        POOL_add(pool, &COVER_tryParameters, data);
      } else {
        COVER_tryParameters(data);
      }
      /* Print status */
      LOCALDISPLAYUPDATE(displayLevel, 2, "\r%u%% ",
                         (unsigned)((iteration * 100) / kIterations));
      ++iteration;
    }
    COVER_best_wait(&best);
    COVER_ctx_destroy(&ctx);
  }
  LOCALDISPLAYLEVEL(displayLevel, 2, "\r%79s\r", "");
  /* Fill the output buffer and parameters with output of the best parameters */
  {
    const size_t dictSize = best.dictSize;
    if (ZSTD_isError(best.compressedSize)) {
      const size_t compressedSize = best.compressedSize;
      COVER_best_destroy(&best);
      POOL_free(pool);
      return compressedSize;
    }
    *parameters = best.parameters;
    memcpy(dictBuffer, best.dict, dictSize);
    COVER_best_destroy(&best);
    POOL_free(pool);
    return dictSize;
  }
}