| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158 | /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. * * This source code is licensed under both the BSD-style license (found in the * LICENSE file in the root directory of this source tree) and the GPLv2 (found * in the COPYING file in the root directory of this source tree). * You may select, at your option, one of the above-listed licenses. */#ifndef ZDICT_STATIC_LINKING_ONLY#  define ZDICT_STATIC_LINKING_ONLY#endif#include <stdio.h>  /* fprintf */#include <stdlib.h> /* malloc, free, qsort */#include <string.h> /* memset */#include <time.h>   /* clock */#include "../common/mem.h" /* read */#include "../common/pool.h"#include "../common/threading.h"#include "../common/zstd_internal.h" /* includes zstd.h */#include "../zdict.h"/** * COVER_best_t is used for two purposes: * 1. Synchronizing threads. * 2. Saving the best parameters and dictionary. * * All of the methods except COVER_best_init() are thread safe if zstd is * compiled with multithreaded support. */typedef struct COVER_best_s {  ZSTD_pthread_mutex_t mutex;  ZSTD_pthread_cond_t cond;  size_t liveJobs;  void *dict;  size_t dictSize;  ZDICT_cover_params_t parameters;  size_t compressedSize;} COVER_best_t;/** * A segment is a range in the source as well as the score of the segment. */typedef struct {  U32 begin;  U32 end;  U32 score;} COVER_segment_t;/** *Number of epochs and size of each epoch. */typedef struct {  U32 num;  U32 size;} COVER_epoch_info_t;/** * Struct used for the dictionary selection function. */typedef struct COVER_dictSelection {  BYTE* dictContent;  size_t dictSize;  size_t totalCompressedSize;} COVER_dictSelection_t;/** * Computes the number of epochs and the size of each epoch. * We will make sure that each epoch gets at least 10 * k bytes. * * The COVER algorithms divide the data up into epochs of equal size and * select one segment from each epoch. * * @param maxDictSize The maximum allowed dictionary size. * @param nbDmers     The number of dmers we are training on. * @param k           The parameter k (segment size). * @param passes      The target number of passes over the dmer corpus. *                    More passes means a better dictionary. */COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,                                       U32 k, U32 passes);/** * Warns the user when their corpus is too small. */void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLevel);/** *  Checks total compressed size of a dictionary */size_t COVER_checkTotalCompressedSize(const ZDICT_cover_params_t parameters,                                      const size_t *samplesSizes, const BYTE *samples,                                      size_t *offsets,                                      size_t nbTrainSamples, size_t nbSamples,                                      BYTE *const dict, size_t dictBufferCapacity);/** * Returns the sum of the sample sizes. */size_t COVER_sum(const size_t *samplesSizes, unsigned nbSamples) ;/** * Initialize the `COVER_best_t`. */void COVER_best_init(COVER_best_t *best);/** * Wait until liveJobs == 0. */void COVER_best_wait(COVER_best_t *best);/** * Call COVER_best_wait() and then destroy the COVER_best_t. */void COVER_best_destroy(COVER_best_t *best);/** * Called when a thread is about to be launched. * Increments liveJobs. */void COVER_best_start(COVER_best_t *best);/** * Called when a thread finishes executing, both on error or success. * Decrements liveJobs and signals any waiting threads if liveJobs == 0. * If this dictionary is the best so far save it and its parameters. */void COVER_best_finish(COVER_best_t *best, ZDICT_cover_params_t parameters,                       COVER_dictSelection_t selection);/** * Error function for COVER_selectDict function. Checks if the return * value is an error. */unsigned COVER_dictSelectionIsError(COVER_dictSelection_t selection); /**  * Error function for COVER_selectDict function. Returns a struct where  * return.totalCompressedSize is a ZSTD error.  */COVER_dictSelection_t COVER_dictSelectionError(size_t error);/** * Always call after selectDict is called to free up used memory from * newly created dictionary. */void COVER_dictSelectionFree(COVER_dictSelection_t selection);/** * Called to finalize the dictionary and select one based on whether or not * the shrink-dict flag was enabled. If enabled the dictionary used is the * smallest dictionary within a specified regression of the compressed size * from the largest dictionary. */ COVER_dictSelection_t COVER_selectDict(BYTE* customDictContent, size_t dictBufferCapacity,                       size_t dictContentSize, const BYTE* samplesBuffer, const size_t* samplesSizes, unsigned nbFinalizeSamples,                       size_t nbCheckSamples, size_t nbSamples, ZDICT_cover_params_t params, size_t* offsets, size_t totalCompressedSize);
 |