falloc.go 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999
  1. // Copyright 2014 The lldb Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // The storage space management.
  5. package lldb
  6. import (
  7. "bytes"
  8. "errors"
  9. "fmt"
  10. "io"
  11. "sort"
  12. "strings"
  13. "sync"
  14. "github.com/cznic/internal/buffer"
  15. "github.com/cznic/mathutil"
  16. "github.com/cznic/zappy"
  17. )
const (
	// maxBuf bounds the size of scratch buffers used for block I/O.
	// maxRq (declared elsewhere in this package) is the maximum content
	// request size; the extra 20 bytes presumably cover block head/tail
	// bookkeeping — TODO(review): confirm against the block layouts above.
	maxBuf = maxRq + 20
)
// Options are passed to the NewAllocator to amend some configuration. The
// compatibility promise is the same as of struct types in the Go standard
// library - introducing changes can be made only by adding new exported
// fields, which is backward compatible as long as client code uses field names
// to assign values of imported struct types literals.
//
// NOTE: No options are currently defined.
//
// NewAllocator still requires a non-nil *Options so that options can be
// introduced later without breaking existing callers.
type Options struct{}
// AllocStats record statistics about a Filer. It can be optionally filled by
// Allocator.Verify, if successful.
type AllocStats struct {
	Handles     int64           // total valid handles in use
	Compression int64           // number of compressed blocks
	TotalAtoms  int64           // total number of atoms == AllocAtoms + FreeAtoms
	AllocBytes  int64           // bytes allocated (after decompression, if/where used)
	AllocAtoms  int64           // atoms allocated/used, including relocation atoms
	Relocations int64           // number of relocated used blocks
	FreeAtoms   int64           // atoms unused
	AllocMap    map[int64]int64 // allocated block size in atoms -> count of such blocks
	FreeMap     map[int64]int64 // free block size in atoms -> count of such blocks
}
  42. /*
  43. Allocator implements "raw" storage space management (allocation and
  44. deallocation) for a low level of a DB engine. The storage is an abstraction
  45. provided by a Filer.
  46. The terms MUST or MUST NOT, if/where used in the documentation of Allocator,
  47. written in all caps as seen here, are a requirement for any possible
  48. alternative implementations aiming for compatibility with this one.
  49. Filer file
  50. A Filer file, or simply 'file', is a linear, contiguous sequence of blocks.
  51. Blocks may be either free (currently unused) or allocated (currently used).
  52. Some blocks may eventually become virtual in a sense as they may not be
  53. realized in the storage (sparse files).
  54. Free Lists Table
  55. File starts with a FLT. This table records heads of 14 doubly linked free
  56. lists. The zero based index (I) vs minimal size of free blocks in that list,
  57. except the last one which registers free blocks of size 4112+ atoms:
  58. MinSize == 2^I
  59. For example 0 -> 1, 1 -> 2, ... 12 -> 4096.
  60. Each entry in the FLT is 8 bytes in network order, MSB MUST be zero, ie. the
  61. slot value is effectively only 7 bytes. The value is the handle of the head of
  62. the respective doubly linked free list. The FLT size is 14*8 == 112(0x70)
  63. bytes. If the free blocks list for any particular size is empty, the respective
  64. FLT slot is zero. Sizes of free blocks in one list MUST NOT overlap with sizes
  65. of free lists in other list. For example, even though a free block of size 2
  66. technically is of minimal size >= 1, it MUST NOT be put to the list for slot 0
  67. (minimal size 1), but in slot 1 (minimal size 2).
  68. slot 0: sizes [1, 2)
  69. slot 1: sizes [2, 4)
  70. slot 2: sizes [4, 8)
  71. ...
  72. slot 11: sizes [2048, 4096)
  73. slot 12: sizes [4096, 4112)
  74. slot 13: sizes [4112, inf)
  75. The last FLT slot collects all free blocks bigger than its minimal size. That
  76. still respects the 'no overlap' invariant.
  77. File blocks
  78. A block is a linear, contiguous sequence of atoms. The first and last atoms of
  79. a block provide information about, for example, whether the block is free or
  80. used, what is the size of the block, etc. Details are discussed elsewhere. The
  81. first block of a file starts immediately after FLT, ie. at file offset
  82. 112(0x70).
  83. Block atoms
  84. An atom is a fixed size piece of a block (and thus of a file too); it is 16
  85. bytes long. A consequence is that for a valid file:
  86. filesize == 0 (mod 16)
  87. The first atom of the first block is considered to be atom #1.
  88. Block handles
  89. A handle is an integer referring to a block. The reference is the number of the
  90. atom the block starts with. Put in other way:
  91. handle == offset/16 - 6
  92. offset == 16 * (handle + 6)
  93. `offset` is the offset of the first byte of the block, measured in bytes
  94. - as in fseek(3). Handle has type `int64`, but only the lower 7 bytes may be
  95. nonzero while referring to a block, both in code as well as when persisted in
  96. the file's internal bookkeeping structures - see 'Block types' below. So a
  97. handle is effectively only `uint56`. This also means that the maximum usable
  98. size of a file is 2^56 atoms. That is 2^60 bytes == 1 exabyte (10^18 bytes).
  99. Nil handles
  100. A handle with numeric value of '0' refers to no block.
  101. Zero padding
  102. A padding is used to round-up a block size to be a whole number of atoms. Any
  103. padding, if present, MUST be all zero bytes. Note that the size of padding is
  104. in [0, 15].
  105. Content wiping
  106. When a block is deallocated, its data content is not wiped as the added
  107. overhead may be substantial while not necessarily needed. Client code should
  108. however overwrite the content of any block having sensitive data with eg. zeros
  109. (good compression) - before deallocating the block.
  110. Block tags
  111. Every block is tagged in its first byte (a head tag) and last byte (tail tag).
  112. Block types are:
  113. 1. Short content used block (head tags 0x00-0xFB)
  114. 2. Long content used block (head tag 0xFC)
  115. 3. Relocated used block (head tag 0xFD)
  116. 4. Short, single atom, free block (head tag 0xFE)
  117. 5. Long free block (head tag 0xFF)
  118. Note: Relocated used block, 3. above (head tag 0xFD) MUST NOT refer to blocks
  119. other than 1. or 2. above (head tags 0x00-0xFC).
  120. Content blocks
  121. Used blocks (head tags 0x00-0xFC) tail tag distinguish used/unused block and if
  122. content is compressed or not.
  123. Content compression
  124. The tail flag of an used block is one of
  125. CC == 0 // Content is not compressed.
  126. CC == 1 // Content is in zappy compression format.
  127. If compression of written content is enabled, there are two cases: If
  128. compressed size < original size then the compressed content should be written
  129. if it will save at least one atom of the block. If compressed size >= original
  130. size then the compressed content should not be used.
  131. It's recommended to use compression. For example the BTrees implementation
  132. assumes compression is used. Using compression may cause a slowdown in some
  133. cases while it may as well cause a speedup.
  134. Short content block
  135. Short content block carries content of length between N == 0(0x00) and N ==
  136. 251(0xFB) bytes.
  137. |<-first atom start ... last atom end->|
  138. +---++-- ... --+-- ... --++------+
  139. | 0 || 1... | 0x*...0x*E || 0x*F |
  140. +---++-- ... --+-- ... --++------+
  141. | N || content | padding || CC |
  142. +---++-- ... --+-- ... --++------+
  143. A == (N+1)/16 + 1 // The number of atoms in the block [1, 16]
  144. padding == 15 - (N+1)%16 // Length of the zero padding
  145. Long content block
  146. Long content block carries content of length between N == 252(0xFC) and N ==
  147. 65787(0x100FB) bytes.
  148. |<-first atom start ... last atom end->|
  149. +------++------+-- ... --+-- ... --++------+
  150. | 0 || 1..2 | 3... | 0x*...0x*E || 0x*F |
  151. +------++------+-- ... --+-- ... --++------+
  152. | 0xFC || M | content | padding || CC |
  153. +------++------+-- ... --+-- ... --++------+
  154. A == (N+3)/16 + 1 // The number of atoms in the block [16, 4112]
  155. M == N % 0x10000 // Stored as 2 bytes in network byte order
  156. padding == 15 - (N+3)%16 // Length of the zero padding
  157. Relocated used block
  158. Relocated block allows to permanently assign a handle to some content and
  159. resize the content anytime afterwards without having to update all the possible
  160. existing references; the handle can be constant while the content size may be
  161. dynamic. When relocating a block, any space left by the original block content,
  162. above this single atom block, MUST be reclaimed.
  163. Relocations MUST point only to a used short or long block == blocks with tags
  164. 0x00...0xFC.
  165. +------++------+---------++----+
  166. | 0 || 1..7 | 8...14 || 15 |
  167. +------++------+---------++----+
  168. | 0xFD || H | padding || 0 |
  169. +------++------+---------++----+
  170. H is the handle of the relocated block in network byte order.
  171. Free blocks
  172. Free blocks are the result of space deallocation. Free blocks are organized in
  173. one or more doubly linked lists, abstracted by the FLT interface. Free blocks
  174. MUST be "registered" by putting them in such list. Allocator MUST reuse a big
  175. enough free block, if such exists, before growing the file size. When a free
  176. block is created by deallocation or reallocation it MUST be joined with any
  177. adjacently existing free blocks before "registering". If the resulting free
  178. block is now a last block of a file, the free block MUST be discarded and the
  179. file size MUST be truncated accordingly instead. Put differently, there MUST
  180. NOT ever be a free block at the file end.
  181. A single free atom
  182. Is an unused block of size 1 atom.
  183. +------++------+--------++------+
  184. | 0 || 1..7 | 8...14 || 15 |
  185. +------++------+--------++------+
  186. | 0xFE || P | N || 0xFE |
  187. +------++------+--------++------+
  188. P and N, stored in network byte order, are the previous and next free block
  189. handles in the doubly linked list to which this free block belongs.
  190. A long unused block
  191. Is an unused block of size > 1 atom.
  192. +------++------+-------+---------+- ... -+----------++------+
  193. | 0 || 1..7 | 8..14 | 15...21 | | Z-7..Z-1 || Z |
  194. +------++------+-------+---------+- ... -+----------++------+
  195. | 0xFF || S | P | N | Leak | S || 0xFF |
  196. +------++------+-------+---------+- ... -+----------++------+
  197. Z == 16 * S - 1
  198. S is the size of this unused block in atoms. P and N are the previous and next
  199. free block handles in the doubly linked list to which this free block belongs.
  200. Leak contains any data the block had before deallocating this block. See also
  201. the subtitle 'Content wiping' above. S, P and N are stored in network byte
  202. order. Large free blocks may trigger a consideration of file hole punching of
  203. the Leak field - for some value of 'large'.
  204. Note: Allocator methods vs CRUD[1]:
  205. Alloc [C]reate
  206. Get [R]ead
  207. Realloc [U]pdate
  208. Free [D]elete
  209. Note: No Allocator method returns io.EOF.
  210. [1]: http://en.wikipedia.org/wiki/Create,_read,_update_and_delete
  211. */
type Allocator struct {
	f        Filer // underlying storage
	flt      flt   // in-memory image of the on-disk free lists table
	Compress bool  // enables content compression

	// Block content cache. m indexes cached nodes by handle; lru orders
	// them most-recently-used first; cache is a pool of reusable nodes.
	cache   cache
	m       map[int64]*node
	lru     lst
	expHit  int64  // lifetime cache hits, reported by CacheStats
	expMiss int64  // lifetime cache misses, reported by CacheStats
	cacheSz int    // current cache capacity in blocks (adapted in Get)
	hit     uint16 // short-window hit counter driving cacheSz growth
	miss    uint16 // short-window miss counter driving cacheSz growth
	mu      sync.Mutex // guards the cache fields above (see Get)
}
// NewAllocator returns a new Allocator. To open an existing file, pass its
// Filer. To create a "new" file, pass a Filer which file is of zero size.
//
// For a zero sized Filer the FLT area is initialized (all zeros == all free
// lists empty) inside a structural transaction. Otherwise the FLT is loaded
// from offset 0 of the existing file.
func NewAllocator(f Filer, opts *Options) (a *Allocator, err error) {
	if opts == nil { // Enforce *Options is always passed
		return nil, errors.New("NewAllocator: nil opts passed")
	}

	a = &Allocator{
		f:       f,
		cacheSz: 10,
	}

	a.cinit()
	// A rollback invalidates both the content cache and the in-memory FLT,
	// so hook the filer (when it supports rollbacks) to rebuild them.
	switch x := f.(type) {
	case *RollbackFiler:
		x.afterRollback = func() error {
			a.cinit()
			return a.flt.load(a.f, 0)
		}
	case *ACIDFiler0:
		x.RollbackFiler.afterRollback = func() error {
			a.cinit()
			return a.flt.load(a.f, 0)
		}
	}

	sz, err := f.Size()
	if err != nil {
		return
	}

	a.flt.init()
	if sz == 0 {
		// New file: persist an all-zero FLT (every free list empty).
		var b [fltSz]byte
		if err = a.f.BeginUpdate(); err != nil {
			return
		}

		if _, err = f.WriteAt(b[:], 0); err != nil {
			a.f.Rollback()
			return
		}

		return a, a.f.EndUpdate()
	}

	// Existing file: read the FLT from the start of the file.
	return a, a.flt.load(f, 0)
}
  267. // CacheStats reports cache statistics.
  268. //
  269. //TODO return a struct perhaps.
  270. func (a *Allocator) CacheStats() (buffersUsed, buffersTotal int, bytesUsed, bytesTotal, hits, misses int64) {
  271. buffersUsed = len(a.m)
  272. buffersTotal = buffersUsed + len(a.cache)
  273. bytesUsed = a.lru.size()
  274. bytesTotal = bytesUsed + a.cache.size()
  275. hits = a.expHit
  276. misses = a.expMiss
  277. return
  278. }
  279. func (a *Allocator) cinit() {
  280. for h, n := range a.m {
  281. a.cache.put(a.lru.remove(n))
  282. delete(a.m, h)
  283. }
  284. if a.m == nil {
  285. a.m = map[int64]*node{}
  286. }
  287. }
  288. func (a *Allocator) cadd(b []byte, h int64) {
  289. if len(a.m) < a.cacheSz {
  290. n := a.cache.get(len(b))
  291. n.h = h
  292. copy(n.b, b)
  293. a.m[h] = a.lru.pushFront(n)
  294. return
  295. }
  296. // cache full
  297. delete(a.m, a.cache.put(a.lru.removeBack()).h)
  298. n := a.cache.get(len(b))
  299. n.h = h
  300. copy(n.b, b)
  301. a.m[h] = a.lru.pushFront(n)
  302. return
  303. }
  304. func (a *Allocator) cfree(h int64) {
  305. n, ok := a.m[h]
  306. if !ok { // must have been evicted
  307. return
  308. }
  309. a.cache.put(a.lru.remove(n))
  310. delete(a.m, h)
  311. }
  312. // Alloc allocates storage space for b and returns the handle of the new block
  313. // with content set to b or an error, if any. The returned handle is valid only
  314. // while the block is used - until the block is deallocated. No two valid
  315. // handles share the same value within the same Filer, but any value of a
  316. // handle not referring to any used block may become valid any time as a result
  317. // of Alloc.
  318. //
  319. // Invoking Alloc on an empty Allocator is guaranteed to return handle with
  320. // value 1. The intended use of content of handle 1 is a root "directory" of
  321. // other data held by an Allocator.
  322. //
  323. // Passing handles not obtained initially from Alloc or not anymore valid to
  324. // any other Allocator methods can result in an irreparably corrupted database.
  325. func (a *Allocator) Alloc(b []byte) (handle int64, err error) {
  326. pbuf := buffer.Get(zappy.MaxEncodedLen(len(b)))
  327. defer buffer.Put(pbuf)
  328. buf := *pbuf
  329. buf, _, cc, err := a.makeUsedBlock(buf, b)
  330. if err != nil {
  331. return
  332. }
  333. if handle, err = a.alloc(buf, cc); err == nil {
  334. a.cadd(b, handle)
  335. }
  336. return
  337. }
// alloc finds or creates a block large enough for the already-encoded block
// image b, writes it tagged with compression flag cc, and returns its handle.
// It first consults the free lists table; only when no suitable free block
// exists does it grow the file.
func (a *Allocator) alloc(b []byte, cc byte) (h int64, err error) {
	rqAtoms := n2atoms(len(b))
	if h = a.flt.find(rqAtoms); h == 0 { // must grow
		var sz int64
		if sz, err = a.f.Size(); err != nil {
			return
		}

		// New block starts at the current end of file.
		h = off2h(sz)
		err = a.writeUsedBlock(h, cc, b)
		return
	}

	// Handle is the first item of a free blocks list.
	tag, s, prev, next, err := a.nfo(h)
	if err != nil {
		return
	}

	// Structural sanity checks: the list head must be a free block with no
	// predecessor and must be at least as large as requested.
	if tag != tagFreeShort && tag != tagFreeLong {
		err = &ErrILSEQ{Type: ErrExpFreeTag, Off: h2off(h), Arg: int64(tag)}
		return
	}

	if prev != 0 {
		err = &ErrILSEQ{Type: ErrHead, Off: h2off(h), Arg: prev}
		return
	}

	if s < int64(rqAtoms) {
		err = &ErrILSEQ{Type: ErrSmall, Arg: int64(rqAtoms), Arg2: s, Off: h2off(h)}
		return
	}

	// Take the block out of its free list.
	if err = a.unlink(h, s, prev, next); err != nil {
		return
	}

	// If the free block is larger than needed, return the surplus tail to
	// the appropriate free list.
	if s > int64(rqAtoms) {
		freeH := h + int64(rqAtoms)
		freeAtoms := s - int64(rqAtoms)
		if err = a.link(freeH, freeAtoms); err != nil {
			return
		}
	}

	return h, a.writeUsedBlock(h, cc, b)
}
  378. // Free deallocates the block referred to by handle or returns an error, if
  379. // any.
  380. //
  381. // After Free succeeds, handle is invalid and must not be used.
  382. //
  383. // Handle must have been obtained initially from Alloc and must be still valid,
  384. // otherwise a database may get irreparably corrupted.
  385. func (a *Allocator) Free(handle int64) (err error) {
  386. if handle <= 0 || handle > maxHandle {
  387. return &ErrINVAL{"Allocator.Free: handle out of limits", handle}
  388. }
  389. a.cfree(handle)
  390. return a.free(handle, 0, true)
  391. }
// free releases the used block at handle h. When the block is a relocation
// stub and acceptRelocs is true, the relocation target is freed first (the
// recursive call passes acceptRelocs == false, as a relocation must not point
// at another relocation). from is the handle of the referring block, used
// only for error reporting.
func (a *Allocator) free(h, from int64, acceptRelocs bool) (err error) {
	tag, atoms, _, n, err := a.nfo(h)
	if err != nil {
		return
	}

	switch tag {
	default:
		// nop: short content block (head tags 0x00-0xFB)
	case tagUsedLong:
		// nop
	case tagUsedRelocated:
		if !acceptRelocs {
			return &ErrILSEQ{Type: ErrUnexpReloc, Off: h2off(h), Arg: h2off(from)}
		}

		// Free the relocation target (handle n) before the stub itself.
		if err = a.free(n, h, false); err != nil {
			return
		}
	case tagFreeShort, tagFreeLong:
		// Double free is a caller error.
		return &ErrINVAL{"Allocator.Free: attempt to free a free block at off", h2off(h)}
	}

	return a.free2(h, atoms)
}
// free2 turns the block at h (atoms long) into free space, coalescing it with
// any adjacent free blocks, and either registers the result in a free list or
// truncates the file when the result would be the file's last block (there
// must never be a free block at the file end).
func (a *Allocator) free2(h, atoms int64) (err error) {
	sz, err := a.f.Size()
	if err != nil {
		return
	}

	// Inspect the block immediately to the left.
	ltag, latoms, lp, ln, err := a.leftNfo(h)
	if err != nil {
		return
	}

	// latoms != 0 from here on means "left neighbor is free".
	if ltag != tagFreeShort && ltag != tagFreeLong {
		latoms = 0
	}

	var rtag byte
	var ratoms, rp, rn int64

	// Inspect the block immediately to the right, unless h is the tail.
	isTail := h2off(h)+atoms*16 == sz
	if !isTail {
		if rtag, ratoms, rp, rn, err = a.nfo(h + atoms); err != nil {
			return
		}
	}

	// ratoms != 0 from here on means "right neighbor is free".
	if rtag != tagFreeShort && rtag != tagFreeLong {
		ratoms = 0
	}

	switch {
	case latoms == 0 && ratoms == 0:
		// -> isolated <-
		if isTail { // cut tail
			return a.f.Truncate(h2off(h))
		}

		return a.link(h, atoms)
	case latoms == 0 && ratoms != 0:
		// right join ->
		if err = a.unlink(h+atoms, ratoms, rp, rn); err != nil {
			return
		}

		return a.link(h, atoms+ratoms)
	case latoms != 0 && ratoms == 0:
		// <- left join
		if err = a.unlink(h-latoms, latoms, lp, ln); err != nil {
			return
		}

		if isTail {
			// Joined block reaches EOF: truncate instead of linking.
			return a.f.Truncate(h2off(h - latoms))
		}

		return a.link(h-latoms, latoms+atoms)
	}

	// case latoms != 0 && ratoms != 0:
	// <- middle join ->
	lh, rh := h-latoms, h+atoms
	if err = a.unlink(lh, latoms, lp, ln); err != nil {
		return
	}

	// Prev unlink may have invalidated rp or rn
	if _, _, rp, rn, err = a.nfo(rh); err != nil {
		return
	}

	if err = a.unlink(rh, ratoms, rp, rn); err != nil {
		return
	}

	return a.link(h-latoms, latoms+atoms+ratoms)
}
  475. // Add a free block h to the appropriate free list
  476. func (a *Allocator) link(h, atoms int64) (err error) {
  477. if err = a.makeFree(h, atoms, 0, a.flt.head(atoms)); err != nil {
  478. return
  479. }
  480. return a.flt.setHead(h, atoms, a.f)
  481. }
  482. // Remove free block h from the free list
  483. func (a *Allocator) unlink(h, atoms, p, n int64) (err error) {
  484. switch {
  485. case p == 0 && n == 0:
  486. // single item list, must be head
  487. return a.flt.setHead(0, atoms, a.f)
  488. case p == 0 && n != 0:
  489. // head of list (has next item[s])
  490. if err = a.prev(n, 0); err != nil {
  491. return
  492. }
  493. // new head
  494. return a.flt.setHead(n, atoms, a.f)
  495. case p != 0 && n == 0:
  496. // last item in list
  497. return a.next(p, 0)
  498. }
  499. // case p != 0 && n != 0:
  500. // intermediate item in a list
  501. if err = a.next(p, n); err != nil {
  502. return
  503. }
  504. return a.prev(n, p)
  505. }
  506. //TODO remove ?
  507. // Return len(slice) == n, reuse src if possible.
  508. func need(n int, src []byte) []byte {
  509. if cap(src) < n {
  510. return *buffer.Get(n)
  511. }
  512. return src[:n]
  513. }
// Get returns the data content of a block referred to by handle or an error if
// any. The returned slice may be a sub-slice of buf if buf was large enough
// to hold the entire content. Otherwise, a newly allocated slice will be
// returned. It is valid to pass a nil buf.
//
// If the content was stored using compression then it is transparently
// returned decompressed.
//
// Handle must have been obtained initially from Alloc and must be still valid,
// otherwise invalid data may be returned without detecting the error.
//
// Get is safe for concurrent access by multiple goroutines iff no other
// goroutine mutates the DB.
func (a *Allocator) Get(buf []byte, handle int64) (b []byte, err error) {
	buf = buf[:cap(buf)]
	// Fast path: serve from the content cache.
	a.mu.Lock() // X1+
	if n, ok := a.m[handle]; ok {
		a.lru.moveToFront(n)
		b = need(len(n.b), buf)
		copy(b, n.b)
		a.expHit++
		a.hit++
		a.mu.Unlock() // X1-
		return
	}

	a.expMiss++
	a.miss++
	// Adaptive sizing: on a poor hit rate (< 95%) over a short window,
	// grow the cache, up to 500 entries.
	if a.miss > 10 && len(a.m) < 500 {
		if 100*a.hit/a.miss < 95 {
			a.cacheSz++
		}
		a.hit, a.miss = 0, 0
	}
	a.mu.Unlock() // X1-

	// On success, cache the content under the originally requested handle
	// (h), not the possibly relocated one.
	defer func(h int64) {
		if err == nil {
			a.mu.Lock() // X2+
			a.cadd(b, h)
			a.mu.Unlock() // X2-
		}
	}(handle)

	pfirst := buffer.Get(16)
	defer buffer.Put(pfirst)
	first := *pfirst
	relocated := false
	relocSrc := handle
reloc:
	if handle <= 0 || handle > maxHandle {
		return nil, &ErrINVAL{"Allocator.Get: handle out of limits", handle}
	}

	// Read the block's first atom and dispatch on its head tag.
	off := h2off(handle)
	if err = a.read(first, off); err != nil {
		return
	}

	switch tag := first[0]; tag {
	default:
		// Short content block: head tag is the content length (0x00-0xFB).
		dlen := int(tag)
		atoms := n2atoms(dlen)
		switch atoms {
		case 1:
			// Single-atom block: content and tail tag are in `first`.
			switch tag := first[15]; tag {
			default:
				return nil, &ErrILSEQ{Type: ErrTailTag, Off: off, Arg: int64(tag)}
			case tagNotCompressed:
				b = need(dlen, buf)
				copy(b, first[1:])
				return
			case tagCompressed:
				return zappy.Decode(buf, first[1:dlen+1])
			}
		default:
			// Multi-atom short block: compression flag is the last byte.
			pcc := buffer.Get(1)
			defer buffer.Put(pcc)
			cc := *pcc
			dlen := int(tag)
			atoms := n2atoms(dlen)
			tailOff := off + 16*int64(atoms) - 1
			if err = a.read(cc, tailOff); err != nil {
				return
			}

			switch tag := cc[0]; tag {
			default:
				return nil, &ErrILSEQ{Type: ErrTailTag, Off: off, Arg: int64(tag)}
			case tagNotCompressed:
				// Content starts right after the 1-byte length head.
				b = need(dlen, buf)
				off += 1
				if err = a.read(b, off); err != nil {
					b = buf[:0]
				}
				return
			case tagCompressed:
				pzbuf := buffer.Get(dlen)
				defer buffer.Put(pzbuf)
				zbuf := *pzbuf
				off += 1
				if err = a.read(zbuf, off); err != nil {
					return buf[:0], err
				}

				return zappy.Decode(buf, zbuf)
			}
		}
	case 0:
		// Empty content.
		return buf[:0], nil
	case tagUsedLong:
		// Long content block: 2-byte encoded length follows the head tag,
		// compression flag is the last byte of the last atom.
		pcc := buffer.Get(1)
		defer buffer.Put(pcc)
		cc := *pcc
		dlen := m2n(int(first[1])<<8 | int(first[2]))
		atoms := n2atoms(dlen)
		tailOff := off + 16*int64(atoms) - 1
		if err = a.read(cc, tailOff); err != nil {
			return
		}

		switch tag := cc[0]; tag {
		default:
			return nil, &ErrILSEQ{Type: ErrTailTag, Off: off, Arg: int64(tag)}
		case tagNotCompressed:
			// Content starts after the 3-byte head (tag + 2 length bytes).
			b = need(dlen, buf)
			off += 3
			if err = a.read(b, off); err != nil {
				b = buf[:0]
			}
			return
		case tagCompressed:
			pzbuf := buffer.Get(dlen)
			defer buffer.Put(pzbuf)
			zbuf := *pzbuf
			off += 3
			if err = a.read(zbuf, off); err != nil {
				return buf[:0], err
			}

			return zappy.Decode(buf, zbuf)
		}
	case tagFreeShort, tagFreeLong:
		// Reading a free block is a caller error (invalid handle).
		return nil, &ErrILSEQ{Type: ErrExpUsedTag, Off: off, Arg: int64(tag)}
	case tagUsedRelocated:
		// Follow the relocation exactly once; a relocation pointing at
		// another relocation is a file-structure error.
		if relocated {
			return nil, &ErrILSEQ{Type: ErrUnexpReloc, Off: off, Arg: relocSrc}
		}

		handle = b2h(first[1:])
		relocated = true
		goto reloc
	}
}
// reallocTestHook, when set (by tests, presumably — it is only consulted in
// Realloc), enables a cache consistency audit after every reallocation.
var reallocTestHook bool
// Realloc sets the content of a block referred to by handle or returns an
// error, if any.
//
// Handle must have been obtained initially from Alloc and must be still valid,
// otherwise a database may get irreparably corrupted.
func (a *Allocator) Realloc(handle int64, b []byte) (err error) {
	if handle <= 0 || handle > maxHandle {
		return &ErrINVAL{"Realloc: handle out of limits", handle}
	}

	// Drop any cached content for the handle before rewriting it.
	a.cfree(handle)
	if err = a.realloc(handle, b); err != nil {
		return
	}

	// Optional expensive cache consistency check, enabled only by tests.
	if reallocTestHook {
		if err = cacheAudit(a.m, &a.lru); err != nil {
			return
		}
	}

	// Re-cache the new content under the same handle.
	a.cadd(b, handle)
	return
}
// realloc is the heavy lifting behind Realloc: it stores b under the existing
// handle, reusing the current block when it fits, extending it in place by
// consuming a free right neighbour, or relocating the content to a newly
// allocated block while leaving a relocation record at the old handle.
func (a *Allocator) realloc(handle int64, b []byte) (err error) {
	var dlen, needAtoms0 int

	pb8 := buffer.Get(8)
	defer buffer.Put(pb8)
	b8 := *pb8
	pdst := buffer.Get(zappy.MaxEncodedLen(len(b)))
	defer buffer.Put(pdst)
	dst := *pdst
	// Possibly compress the content and compute the atoms it requires.
	b, needAtoms0, cc, err := a.makeUsedBlock(dst, b)
	if err != nil {
		return
	}

	needAtoms := int64(needAtoms0)
	off := h2off(handle)
	if err = a.read(b8[:], off); err != nil {
		return
	}

	// Decode the current content length (dlen) of the block at handle.
	switch tag := b8[0]; tag {
	default: // short used block: the tag byte is the content length
		dlen = int(b8[0])
	case tagUsedLong:
		dlen = m2n(int(b8[1])<<8 | int(b8[2]))
	case tagUsedRelocated:
		// Free the old relocation target; the record itself gets reused.
		if err = a.free(b2h(b8[1:]), handle, false); err != nil {
			return err
		}

		dlen = 0
	case tagFreeShort, tagFreeLong:
		return &ErrINVAL{"Allocator.Realloc: invalid handle", handle}
	}

	atoms := int64(n2atoms(dlen))
retry:
	switch {
	case needAtoms < atoms:
		// in place shrink
		if err = a.writeUsedBlock(handle, cc, b); err != nil {
			return
		}

		fh, fa := handle+needAtoms, atoms-needAtoms
		sz, err := a.f.Size()
		if err != nil {
			return err
		}

		// If the freed tail reaches EOF, truncate instead of freeing.
		if h2off(fh)+16*fa == sz {
			return a.f.Truncate(h2off(fh))
		}

		return a.free2(fh, fa)
	case needAtoms == atoms:
		// in place replace
		return a.writeUsedBlock(handle, cc, b)
	}

	// case needAtoms > atoms:
	// in place extend or relocate
	var sz int64
	if sz, err = a.f.Size(); err != nil {
		return
	}

	off = h2off(handle)
	switch {
	case off+atoms*16 == sz:
		// relocating tail block - shortcut
		return a.writeUsedBlock(handle, cc, b)
	default:
		if off+atoms*16 < sz {
			// handle is not a tail block, check right neighbour
			rh := handle + atoms
			rtag, ratoms, p, n, e := a.nfo(rh)
			if e != nil {
				return e
			}

			if rtag == tagFreeShort || rtag == tagFreeLong {
				// Right neighbour is a free block
				if needAtoms <= atoms+ratoms {
					// can expand in place
					if err = a.unlink(rh, ratoms, p, n); err != nil {
						return
					}

					atoms += ratoms
					goto retry
				}
			}
		}
	}

	// Relocate: first shrink the old block to a single atom (frees its
	// tail), then allocate the content elsewhere and turn the old block
	// into a relocation record pointing at the new handle.
	if atoms > 1 {
		if err = a.realloc(handle, nil); err != nil {
			return
		}
	}

	var newH int64
	if newH, err = a.alloc(b, cc); err != nil {
		return err
	}

	prb := buffer.CGet(16)
	defer buffer.Put(prb)
	rb := *prb
	rb[0] = tagUsedRelocated
	h2b(rb[1:], newH)
	if err = a.writeAt(rb[:], h2off(handle)); err != nil {
		return
	}

	return a.writeUsedBlock(newH, cc, b)
}
  782. func (a *Allocator) writeAt(b []byte, off int64) (err error) {
  783. var n int
  784. if n, err = a.f.WriteAt(b, off); err != nil {
  785. return
  786. }
  787. if n != len(b) {
  788. err = io.ErrShortWrite
  789. }
  790. return
  791. }
  792. func (a *Allocator) write(off int64, b ...[]byte) (err error) {
  793. rq := 0
  794. for _, part := range b {
  795. rq += len(part)
  796. }
  797. pbuf := buffer.Get(rq)
  798. defer buffer.Put(pbuf)
  799. buf := *pbuf
  800. buf = buf[:0]
  801. for _, part := range b {
  802. buf = append(buf, part...)
  803. }
  804. return a.writeAt(buf, off)
  805. }
  806. func (a *Allocator) read(b []byte, off int64) (err error) {
  807. var rn int
  808. if rn, err = a.f.ReadAt(b, off); rn != len(b) {
  809. return &ErrILSEQ{Type: ErrOther, Off: off, More: err}
  810. }
  811. return nil
  812. }
// nfo returns h's tag. If it's a free block then return also (s)ize (in
// atoms), (p)rev and (n)ext. If it's a used block then only (s)ize is returned
// (again in atoms). If it's a used relocate block then (n)ext is set to the
// relocation target handle.
func (a *Allocator) nfo(h int64) (tag byte, s, p, n int64, err error) {
	off := h2off(h)
	rq := int64(22) // max header bytes any block form can occupy (long free)
	sz, err := a.f.Size()
	if err != nil {
		return
	}

	// Clamp the read near EOF; every block still has at least 15 readable
	// bytes, otherwise the image is truncated.
	if off+rq >= sz {
		if rq = sz - off; rq < 15 {
			err = io.ErrUnexpectedEOF
			return
		}
	}

	pbuf := buffer.Get(22)
	defer buffer.Put(pbuf)
	buf := *pbuf
	if err = a.read(buf[:rq], off); err != nil {
		return
	}

	switch tag = buf[0]; tag {
	default: // short used block: the tag byte is the content length
		s = int64(n2atoms(int(tag)))
	case tagUsedLong:
		s = int64(n2atoms(m2n(int(buf[1])<<8 | int(buf[2]))))
	case tagFreeLong:
		// Long free header needs the full 22 bytes {tag, size, prev, next}.
		if rq < 22 {
			err = io.ErrUnexpectedEOF
			return
		}

		s, p, n = b2h(buf[1:]), b2h(buf[8:]), b2h(buf[15:])
	case tagUsedRelocated:
		s, n = 1, b2h(buf[1:])
	case tagFreeShort:
		s, p, n = 1, b2h(buf[1:]), b2h(buf[8:])
	}
	return
}
  854. // leftNfo returns nfo for h's left neighbor if h > 1 and the left neighbor is
  855. // a free block. Otherwise all zero values are returned instead.
  856. func (a *Allocator) leftNfo(h int64) (tag byte, s, p, n int64, err error) {
  857. if !(h > 1) {
  858. return
  859. }
  860. pbuf := buffer.Get(8)
  861. defer buffer.Put(pbuf)
  862. buf := *pbuf
  863. off := h2off(h)
  864. if err = a.read(buf[:], off-8); err != nil {
  865. return
  866. }
  867. switch tag := buf[7]; tag {
  868. case tagFreeShort:
  869. return a.nfo(h - 1)
  870. case tagFreeLong:
  871. return a.nfo(h - b2h(buf[:]))
  872. }
  873. return
  874. }
  875. // Set h.prev = p
  876. func (a *Allocator) prev(h, p int64) (err error) {
  877. pb := buffer.Get(7)
  878. defer buffer.Put(pb)
  879. b := *pb
  880. off := h2off(h)
  881. if err = a.read(b[:1], off); err != nil {
  882. return
  883. }
  884. switch tag := b[0]; tag {
  885. default:
  886. return &ErrILSEQ{Type: ErrExpFreeTag, Off: off, Arg: int64(tag)}
  887. case tagFreeShort:
  888. off += 1
  889. case tagFreeLong:
  890. off += 8
  891. }
  892. return a.writeAt(h2b(b[:7], p), off)
  893. }
  894. // Set h.next = n
  895. func (a *Allocator) next(h, n int64) (err error) {
  896. pb := buffer.Get(7)
  897. defer buffer.Put(pb)
  898. b := *pb
  899. off := h2off(h)
  900. if err = a.read(b[:1], off); err != nil {
  901. return
  902. }
  903. switch tag := b[0]; tag {
  904. default:
  905. return &ErrILSEQ{Type: ErrExpFreeTag, Off: off, Arg: int64(tag)}
  906. case tagFreeShort:
  907. off += 8
  908. case tagFreeLong:
  909. off += 15
  910. }
  911. return a.writeAt(h2b(b[:7], n), off)
  912. }
// Make the filer image @h a free block of 'atoms' atoms with the given
// prev/next free list links, and back-patch the neighboring list nodes to
// point at h.
func (a *Allocator) makeFree(h, atoms, prev, next int64) (err error) {
	pbuf := buffer.Get(22)
	defer buffer.Put(pbuf)
	buf := *pbuf
	switch {
	case atoms == 1:
		// Short free block: {tag, prev, next} with the tag repeated in the
		// block's last byte.
		buf[0], buf[15] = tagFreeShort, tagFreeShort
		h2b(buf[1:], prev)
		h2b(buf[8:], next)
		if err = a.write(h2off(h), buf[:16]); err != nil {
			return
		}
	default:
		// Long free block: header {tag, size, prev, next} ...
		buf[0] = tagFreeLong
		h2b(buf[1:], atoms)
		h2b(buf[8:], prev)
		h2b(buf[15:], next)
		if err = a.write(h2off(h), buf[:22]); err != nil {
			return
		}

		// ... plus a {size, tag} trailer in the block's last 8 bytes.
		h2b(buf[:], atoms)
		buf[7] = tagFreeLong
		if err = a.write(h2off(h+atoms)-8, buf[:8]); err != nil {
			return
		}
	}
	// Link the block into its free list.
	if prev != 0 {
		if err = a.next(prev, h); err != nil {
			return
		}
	}

	if next != 0 {
		err = a.prev(next, h)
	}
	return
}
  950. func (a *Allocator) makeUsedBlock(dst []byte, b []byte) (w []byte, rqAtoms int, cc byte, err error) {
  951. cc = tagNotCompressed
  952. w = b
  953. var n int
  954. if n = len(b); n > maxRq {
  955. return nil, 0, 0, &ErrINVAL{"Allocator.makeUsedBlock: content size out of limits", n}
  956. }
  957. rqAtoms = n2atoms(n)
  958. if a.Compress && n > 14 { // attempt compression
  959. if dst, err = zappy.Encode(dst, b); err != nil {
  960. return
  961. }
  962. n2 := len(dst)
  963. if rqAtoms2 := n2atoms(n2); rqAtoms2 < rqAtoms { // compression saved at least a single atom
  964. w, n, rqAtoms, cc = dst, n2, rqAtoms2, tagCompressed
  965. }
  966. }
  967. return
  968. }
// writeUsedBlock writes the complete on-disk image of a used block at handle
// h: the header (short or long form), the content b, zero padding and the
// compression tag cc as the block's last byte.
func (a *Allocator) writeUsedBlock(h int64, cc byte, b []byte) (err error) {
	n := len(b)
	rq := n2atoms(n) << 4 // block size in bytes
	pbuf := buffer.Get(rq)
	defer buffer.Put(pbuf)
	buf := *pbuf
	switch n <= maxShort {
	case true:
		// Short form: the first byte is the content length itself.
		buf[0] = byte(n)
		copy(buf[1:], b)
	case false:
		// Long form: 3 byte header with the mapped content length.
		m := n2m(n)
		buf[0], buf[1], buf[2] = tagUsedLong, byte(m>>8), byte(m)
		copy(buf[3:], b)
	}
	if p := n2padding(n); p != 0 {
		// Zero the p padding bytes preceding the tail tag; the tail byte
		// itself is overwritten with cc below.
		copy(buf[rq-1-p:], zeros[:])
	}
	buf[rq-1] = cc
	return a.writeAt(buf, h2off(h))
}
  990. func (a *Allocator) verifyUnused(h, totalAtoms int64, tag byte, log func(error) bool, fast bool) (atoms, prev, next int64, err error) {
  991. switch tag {
  992. default:
  993. panic("internal error")
  994. case tagFreeShort:
  995. var b [16]byte
  996. off := h2off(h)
  997. if err = a.read(b[:], off); err != nil {
  998. return
  999. }
  1000. if b[15] != tagFreeShort {
  1001. err = &ErrILSEQ{Type: ErrShortFreeTailTag, Off: off, Arg: int64(b[15])}
  1002. log(err)
  1003. return
  1004. }
  1005. atoms, prev, next = 1, b2h(b[1:]), b2h(b[8:])
  1006. case tagFreeLong:
  1007. var b [22]byte
  1008. off := h2off(h)
  1009. if err = a.read(b[:], off); err != nil {
  1010. return
  1011. }
  1012. atoms, prev, next = b2h(b[1:]), b2h(b[8:]), b2h(b[15:])
  1013. if fast {
  1014. return
  1015. }
  1016. if atoms < 2 {
  1017. err = &ErrILSEQ{Type: ErrLongFreeBlkTooShort, Off: off, Arg: int64(atoms)}
  1018. break
  1019. }
  1020. if h+atoms-1 > totalAtoms {
  1021. err = &ErrILSEQ{Type: ErrLongFreeBlkTooLong, Off: off, Arg: atoms}
  1022. break
  1023. }
  1024. if prev > totalAtoms {
  1025. err = &ErrILSEQ{Type: ErrLongFreePrevBeyondEOF, Off: off, Arg: next}
  1026. break
  1027. }
  1028. if next > totalAtoms {
  1029. err = &ErrILSEQ{Type: ErrLongFreeNextBeyondEOF, Off: off, Arg: next}
  1030. break
  1031. }
  1032. toff := h2off(h+atoms) - 8
  1033. if err = a.read(b[:8], toff); err != nil {
  1034. return
  1035. }
  1036. if b[7] != tag {
  1037. err = &ErrILSEQ{Type: ErrLongFreeTailTag, Off: off, Arg: int64(b[7])}
  1038. break
  1039. }
  1040. if s2 := b2h(b[:]); s2 != atoms {
  1041. err = &ErrILSEQ{Type: ErrVerifyTailSize, Off: off, Arg: atoms, Arg2: s2}
  1042. break
  1043. }
  1044. }
  1045. if err != nil {
  1046. log(err)
  1047. }
  1048. return
  1049. }
// verifyUsed verifies a single used block at handle h and returns whether its
// content is compressed, the content length, the block size in atoms and, for
// a relocation record, the target handle (link). In fast mode only the header
// is decoded; otherwise the span, padding, tail tag and (for compressed
// content) decompressability are checked too. Structural problems are
// reported to 'log' and returned as *ErrILSEQ.
func (a *Allocator) verifyUsed(h, totalAtoms int64, tag byte, buf, ubuf []byte, log func(error) bool, fast bool) (compressed bool, dlen int, atoms, link int64, err error) {
	var (
		padding  int
		doff     int64
		padZeros [15]byte
		tailBuf  [16]byte
	)

	// Decode the header: content length, block size in atoms, padding byte
	// count and the file offset of the content.
	switch tag {
	default: // Short used
		dlen = int(tag)
		atoms = int64((dlen+1)/16) + 1
		padding = 15 - (dlen+1)%16
		doff = h2off(h) + 1
	case tagUsedLong:
		off := h2off(h) + 1
		var b2 [2]byte
		if err = a.read(b2[:], off); err != nil {
			return
		}

		dlen = m2n(int(b2[0])<<8 | int(b2[1]))
		atoms = int64((dlen+3)/16) + 1
		padding = 15 - (dlen+3)%16
		doff = h2off(h) + 3
	case tagUsedRelocated:
		dlen = 7
		atoms = 1
		padding = 7
		doff = h2off(h) + 1
	case tagFreeShort, tagFreeLong:
		panic("internal error")
	}

	if fast {
		// Fast mode: decode only the relocation target, skip all checks.
		if tag == tagUsedRelocated {
			dlen = 0
			if err = a.read(buf[:7], doff); err != nil {
				return
			}

			link = b2h(buf)
		}

		return false, dlen, atoms, link, nil
	}

	if ok := h+atoms-1 <= totalAtoms; !ok { // invalid last block
		err = &ErrILSEQ{Type: ErrVerifyUsedSpan, Off: h2off(h), Arg: atoms}
		log(err)
		return
	}

	// Read the tail: padding bytes plus the compression tag byte.
	tailsz := 1 + padding
	off := h2off(h) + 16*atoms - int64(tailsz)
	if err = a.read(tailBuf[:tailsz], off); err != nil {
		return false, 0, 0, 0, err
	}

	// All padding bytes must be zero.
	if ok := bytes.Equal(padZeros[:padding], tailBuf[:padding]); !ok {
		err = &ErrILSEQ{Type: ErrVerifyPadding, Off: h2off(h)}
		log(err)
		return
	}

	var cc byte
	switch cc = tailBuf[padding]; cc {
	default:
		err = &ErrILSEQ{Type: ErrTailTag, Off: h2off(h)}
		log(err)
		return
	case tagCompressed:
		compressed = true
		// Relocation records are never compressed.
		if tag == tagUsedRelocated {
			err = &ErrILSEQ{Type: ErrTailTag, Off: h2off(h)}
			log(err)
			return
		}

		fallthrough
	case tagNotCompressed:
		if err = a.read(buf[:dlen], doff); err != nil {
			return false, 0, 0, 0, err
		}
	}

	// Compressed content must decode and fit the size limit.
	if cc == tagCompressed {
		if ubuf, err = zappy.Decode(ubuf, buf[:dlen]); err != nil || len(ubuf) > maxRq {
			err = &ErrILSEQ{Type: ErrDecompress, Off: h2off(h)}
			log(err)
			return
		}

		dlen = len(ubuf)
	}

	// A relocation target must be a valid non-null handle within the file.
	if tag == tagUsedRelocated {
		link = b2h(buf)
		if link == 0 {
			err = &ErrILSEQ{Type: ErrNullReloc, Off: h2off(h)}
			log(err)
			return
		}

		if link > totalAtoms { // invalid last block
			err = &ErrILSEQ{Type: ErrRelocBeyondEOF, Off: h2off(h), Arg: link}
			log(err)
			return
		}
	}

	return
}
// nolog is the default Verify log callback; it rejects every reported
// problem, stopping verification at the first one.
var nolog = func(error) bool { return false }
  1149. // Verify attempts to find any structural errors in a Filer wrt the
  1150. // organization of it as defined by Allocator. 'bitmap' is a scratch pad for
  1151. // necessary bookkeeping and will grow to at most to Allocator's
  1152. // Filer.Size()/128 (0,78%). Any problems found are reported to 'log' except
  1153. // non verify related errors like disk read fails etc. If 'log' returns false
  1154. // or the error doesn't allow to (reliably) continue, the verification process
  1155. // is stopped and an error is returned from the Verify function. Passing a nil
  1156. // log works like providing a log function always returning false. Any
  1157. // non-structural errors, like for instance Filer read errors, are NOT reported
  1158. // to 'log', but returned as the Verify's return value, because Verify cannot
  1159. // proceed in such cases. Verify returns nil only if it fully completed
  1160. // verifying Allocator's Filer without detecting any error.
  1161. //
  1162. // It is recommended to limit the number reported problems by returning false
  1163. // from 'log' after reaching some limit. Huge and corrupted DB can produce an
  1164. // overwhelming error report dataset.
  1165. //
  1166. // The verifying process will scan the whole DB at least 3 times (a trade
  1167. // between processing space and time consumed). It doesn't read the content of
  1168. // free blocks above the head/tail info bytes. If the 3rd phase detects lost
  1169. // free space, then a 4th scan (a faster one) is performed to precisely report
  1170. // all of them.
  1171. //
  1172. // If the DB/Filer to be verified is reasonably small, respective if its
  1173. // size/128 can comfortably fit within process's free memory, then it is
  1174. // recommended to consider using a MemFiler for the bit map.
  1175. //
  1176. // Statistics are returned via 'stats' if non nil. The statistics are valid
  1177. // only if Verify succeeded, ie. it didn't reported anything to log and it
  1178. // returned a nil error.
func (a *Allocator) Verify(bitmap Filer, log func(error) bool, stats *AllocStats) (err error) {
	if log == nil {
		log = nolog
	}

	// The scratch bit map must start empty; it records one bit per handle.
	n, err := bitmap.Size()
	if err != nil {
		return
	}

	if n != 0 {
		return &ErrINVAL{"Allocator.Verify: bit map initial size non zero (%d)", n}
	}

	var bits int64 // number of bits currently set in the bitmap
	bitMask := [8]byte{1, 2, 4, 8, 16, 32, 64, 128}
	byteBuf := []byte{0}

	//DONE
	// +performance, this implementation is hopefully correct but _very_
	// naive, probably good as a prototype only. Use maybe a MemFiler
	// "cache" etc.
	// ----
	// Turns out the OS caching is as effective as it can probably get.

	// bit sets (on == true) or clears the bitmap bit for handle h and
	// reports its previous state, maintaining the 'bits' counter.
	bit := func(on bool, h int64) (wasOn bool, err error) {
		m := bitMask[h&7]
		off := h >> 3
		var v byte
		sz, err := bitmap.Size()
		if err != nil {
			return
		}

		if off < sz {
			if n, err := bitmap.ReadAt(byteBuf, off); n != 1 {
				return false, &ErrILSEQ{Type: ErrOther, Off: off, More: fmt.Errorf("Allocator.Verify - reading bitmap: %s", err)}
			}

			v = byteBuf[0]
		}

		switch wasOn = v&m != 0; on {
		case true:
			if !wasOn {
				v |= m
				bits++
			}
		case false:
			if wasOn {
				v ^= m
				bits--
			}
		}

		byteBuf[0] = v
		if n, err := bitmap.WriteAt(byteBuf, off); n != 1 || err != nil {
			return false, &ErrILSEQ{Type: ErrOther, Off: off, More: fmt.Errorf("Allocator.Verify - writing bitmap: %s", err)}
		}

		return
	}

	// Phase 1 - sequentially scan a.f to reliably determine block
	// boundaries. Set a bit for every block start.
	var (
		buf, ubuf       [maxRq]byte
		prevH, h, atoms int64
		wasOn           bool
		tag             byte
		st              = AllocStats{
			AllocMap: map[int64]int64{},
			FreeMap:  map[int64]int64{},
		}
		dlen int
	)

	fsz, err := a.f.Size()
	if err != nil {
		return
	}

	// The file must be an exact multiple of the atom size.
	ok := fsz%16 == 0
	totalAtoms := (fsz - fltSz) / atomLen
	if !ok {
		err = &ErrILSEQ{Type: ErrFileSize, Name: a.f.Name(), Arg: fsz}
		log(err)
		return
	}

	st.TotalAtoms = totalAtoms
	prevTag := -1
	lastH := int64(-1)

	for h = 1; h <= totalAtoms; h += atoms {
		prevH = h // For checking last block == used

		off := h2off(h)
		if err = a.read(buf[:1], off); err != nil {
			return
		}

		switch tag = buf[0]; tag {
		default: // Short used
			fallthrough
		case tagUsedLong, tagUsedRelocated:
			var compressed bool
			if compressed, dlen, atoms, _, err = a.verifyUsed(h, totalAtoms, tag, buf[:], ubuf[:], log, false); err != nil {
				return
			}

			if compressed {
				st.Compression++
			}
			st.AllocAtoms += atoms
			switch {
			case tag == tagUsedRelocated:
				st.AllocMap[1]++
				st.Relocations++
			default:
				st.AllocMap[atoms]++
				st.AllocBytes += int64(dlen)
				st.Handles++
			}
		case tagFreeShort, tagFreeLong:
			// Adjacent free blocks should have been merged on free.
			if prevTag == tagFreeShort || prevTag == tagFreeLong {
				err = &ErrILSEQ{Type: ErrAdjacentFree, Off: h2off(lastH), Arg: off}
				log(err)
				return
			}

			if atoms, _, _, err = a.verifyUnused(h, totalAtoms, tag, log, false); err != nil {
				return
			}

			st.FreeMap[atoms]++
			st.FreeAtoms += atoms
		}

		if wasOn, err = bit(true, h); err != nil {
			return
		}

		if wasOn {
			panic("internal error")
		}

		prevTag = int(tag)
		lastH = h
	}

	// The last block must not be free - it would have been truncated away.
	if totalAtoms != 0 && (tag == tagFreeShort || tag == tagFreeLong) {
		err = &ErrILSEQ{Type: ErrFreeTailBlock, Off: h2off(prevH)}
		log(err)
		return
	}

	// Phase 2 - check used blocks, turn off the map bit for every used
	// block.
	for h = 1; h <= totalAtoms; h += atoms {
		off := h2off(h)
		if err = a.read(buf[:1], off); err != nil {
			return
		}

		var link int64
		switch tag = buf[0]; tag {
		default: // Short used
			fallthrough
		case tagUsedLong, tagUsedRelocated:
			if _, _, atoms, link, err = a.verifyUsed(h, totalAtoms, tag, buf[:], ubuf[:], log, true); err != nil {
				return
			}
		case tagFreeShort, tagFreeLong:
			if atoms, _, _, err = a.verifyUnused(h, totalAtoms, tag, log, true); err != nil {
				return
			}
		}

		turnoff := true
		switch tag {
		case tagUsedRelocated:
			// A relocation target must be a plain used block.
			if err = a.read(buf[:1], h2off(link)); err != nil {
				return
			}

			switch linkedTag := buf[0]; linkedTag {
			case tagFreeShort, tagFreeLong, tagUsedRelocated:
				err = &ErrILSEQ{Type: ErrInvalidRelocTarget, Off: off, Arg: link}
				log(err)
				return
			}

		case tagFreeShort, tagFreeLong:
			turnoff = false
		}

		if !turnoff {
			continue
		}

		if wasOn, err = bit(false, h); err != nil {
			return
		}

		if !wasOn {
			panic("internal error")
		}
	}

	// Phase 3 - using the flt check heads link to proper free blocks. For
	// every free block, walk the list, verify the {next, prev} links and
	// turn the respective map bit off. After processing all free lists,
	// the map bits count should be zero. Otherwise there are "lost" free
	// blocks.
	var prev, next, fprev, fnext int64
	rep := a.flt

	for _, list := range rep {
		prev, next = 0, list.head
		for ; next != 0; prev, next = next, fnext {
			if wasOn, err = bit(false, next); err != nil {
				return
			}

			if !wasOn {
				err = &ErrILSEQ{Type: ErrFLT, Off: h2off(next), Arg: h}
				log(err)
				return
			}

			off := h2off(next)
			if err = a.read(buf[:1], off); err != nil {
				return
			}

			switch tag = buf[0]; tag {
			default:
				panic("internal error")
			case tagFreeShort, tagFreeLong:
				if atoms, fprev, fnext, err = a.verifyUnused(next, totalAtoms, tag, log, true); err != nil {
					return
				}

				// The block must be large enough for this size class.
				if min := list.minSize; atoms < min {
					err = &ErrILSEQ{Type: ErrFLTSize, Off: h2off(next), Arg: atoms, Arg2: min}
					log(err)
					return
				}

				// The stored back link must match the walk.
				if fprev != prev {
					err = &ErrILSEQ{Type: ErrFreeChaining, Off: h2off(next)}
					log(err)
					return
				}
			}
		}
	}

	if bits == 0 { // Verify succeeded
		if stats != nil {
			*stats = st
		}

		return
	}

	// Phase 4 - if after phase 3 there are lost free blocks, report all of
	// them to 'log'
	for i := range ubuf { // setup zeros for compares
		ubuf[i] = 0
	}

	var off, lh int64
	rem, err := bitmap.Size()
	if err != nil {
		return err
	}

	for rem != 0 {
		rq := int(mathutil.MinInt64(64*1024, rem))
		var n int
		if n, err = bitmap.ReadAt(buf[:rq], off); n != rq {
			return &ErrILSEQ{Type: ErrOther, Off: off, More: fmt.Errorf("bitmap ReadAt(size %d, off %#x): %s", rq, off, err)}
		}

		if !bytes.Equal(buf[:rq], ubuf[:rq]) {
			for d, v := range buf[:rq] {
				if v != 0 {
					// Report the first set bit as a lost free block.
					for i, m := range bitMask {
						if v&m != 0 {
							lh = 8*(off+int64(d)) + int64(i)
							err = &ErrILSEQ{Type: ErrLostFreeBlock, Off: h2off(lh)}
							log(err)
							return
						}
					}
				}
			}
		}

		off += int64(rq)
		rem -= int64(rq)
	}

	return
}
  1439. type fltSlot struct {
  1440. head int64
  1441. minSize int64
  1442. }
  1443. func (f fltSlot) String() string {
  1444. return fmt.Sprintf("head %#x, minSize %#x\n", f.head, f.minSize)
  1445. }
  1446. type flt [14]fltSlot
  1447. func (f *flt) init() {
  1448. sz := 1
  1449. for i := range *f {
  1450. f[i].minSize, f[i].head = int64(sz), 0
  1451. sz <<= 1
  1452. }
  1453. f[13].minSize = 4112
  1454. }
// load (re)initializes the in-memory free lists table heads from the FLT
// image stored in fi at offset off.
func (f *flt) load(fi Filer, off int64) (err error) {
	pb := buffer.Get(fltSz)
	defer buffer.Put(pb)
	b := *pb
	if _, err = fi.ReadAt(b[:], off); err != nil {
		return
	}

	// Each slot occupies 8 bytes in the image; the 7 byte head handle
	// follows a 1 byte prefix.
	for i := range *f {
		off := 8*i + 1
		f[i].head = b2h(b[off:])
	}
	return
}
  1468. func (f *flt) find(rq int) (h int64) {
  1469. switch {
  1470. case rq < 1:
  1471. panic(rq)
  1472. case rq >= maxFLTRq:
  1473. h, f[13].head = f[13].head, 0
  1474. return
  1475. default:
  1476. g := f[mathutil.Log2Uint16(uint16(rq)):]
  1477. for i := range g {
  1478. p := &g[i]
  1479. if rq <= int(p.minSize) {
  1480. if h = p.head; h != 0 {
  1481. p.head = 0
  1482. return
  1483. }
  1484. }
  1485. }
  1486. return
  1487. }
  1488. }
// head returns the head of the free list whose size class covers 'atoms'
// without removing it from the list (cf. find). atoms must be >= 1.
func (f *flt) head(atoms int64) (h int64) {
	switch {
	case atoms < 1:
		panic(atoms)
	case atoms >= maxFLTRq:
		// All blocks of maxFLTRq atoms or more live in the last slot.
		return f[13].head
	default:
		lg := mathutil.Log2Uint16(uint16(atoms))
		g := f[lg:]
		// Since lg == floor(log2(atoms)), atoms < g[1].minSize always
		// holds, so the loop returns before g[i+1] can go out of range.
		for i := range g {
			if atoms < g[i+1].minSize {
				return g[i].head
			}
		}
		panic("internal error")
	}
}
  1506. func (f *flt) setHead(h, atoms int64, fi Filer) (err error) {
  1507. switch {
  1508. case atoms < 1:
  1509. panic(atoms)
  1510. case atoms >= maxFLTRq:
  1511. pb := buffer.Get(7)
  1512. defer buffer.Put(pb)
  1513. b := *pb
  1514. if _, err = fi.WriteAt(h2b(b[:], h), 8*13+1); err != nil {
  1515. return
  1516. }
  1517. f[13].head = h
  1518. return
  1519. default:
  1520. lg := mathutil.Log2Uint16(uint16(atoms))
  1521. g := f[lg:]
  1522. for i := range f {
  1523. if atoms < g[i+1].minSize {
  1524. pb := buffer.Get(7)
  1525. defer buffer.Put(pb)
  1526. b := *pb
  1527. if _, err = fi.WriteAt(h2b(b[:], h), 8*int64(i+lg)+1); err != nil {
  1528. return
  1529. }
  1530. g[i].head = h
  1531. return
  1532. }
  1533. }
  1534. panic("internal error")
  1535. }
  1536. }
  1537. func (f *flt) String() string {
  1538. a := []string{}
  1539. for i, v := range *f {
  1540. a = append(a, fmt.Sprintf("[%2d] %s", i, v))
  1541. }
  1542. return strings.Join(a, "")
  1543. }
// node is an element of the block cache: buffer b cached for handle h,
// linked into an LRU list (lst) via prev/next.
type node struct {
	b          []byte
	h          int64
	prev, next *node
}

// cache is a pool of spare nodes kept sorted by len(b) so get2 can
// binary-search for the best fitting buffer.
type cache []*node
// get returns a node with len(b) == n, recycled from the cache or newly
// allocated. The buffer content is undefined (cf. cget).
func (c *cache) get(n int) *node {
	r, _ := c.get2(n)
	return r
}
// get2 returns a node with len(b) == n and reports whether its buffer is
// known to be zeroed. The smallest cached buffer with length >= n is
// recycled; when none is big enough the largest node gets a fresh buffer.
func (c *cache) get2(n int) (r *node, isZeroed bool) {
	s := *c
	lens := len(s)
	if lens == 0 {
		// Empty cache: allocate a fresh (zeroed) buffer.
		return &node{b: make([]byte, n, mathutil.Min(2*n, maxBuf))}, true
	}

	// The cache is sorted by len(b); find the first buffer that fits.
	i := sort.Search(lens, func(x int) bool { return len(s[x].b) >= n })
	if i == lens {
		// No cached buffer is large enough; give the largest node a new one.
		i--
		s[i].b, isZeroed = make([]byte, n, mathutil.Min(2*n, maxBuf)), true
	}
	r = s[i]
	r.b = r.b[:n]
	// Remove the chosen node from the sorted cache.
	copy(s[i:], s[i+1:])
	s = s[:lens-1]
	*c = s
	return
}
  1572. func (c *cache) cget(n int) (r *node) {
  1573. r, ok := c.get2(n)
  1574. if ok {
  1575. return
  1576. }
  1577. for i := range r.b {
  1578. r.b[i] = 0
  1579. }
  1580. return
  1581. }
  1582. func (c *cache) size() (sz int64) {
  1583. for _, n := range *c {
  1584. sz += int64(cap(n.b))
  1585. }
  1586. return
  1587. }
// put returns node n to the cache, restoring its buffer to full capacity and
// keeping the cache sorted by buffer length; n is returned for convenience.
func (c *cache) put(n *node) *node {
	s := *c
	n.b = n.b[:cap(n.b)]
	lenb := len(n.b)
	lens := len(s)
	// Find the insertion point that keeps s sorted by len(b).
	i := sort.Search(lens, func(x int) bool { return len(s[x].b) >= lenb })
	s = append(s, nil)
	copy(s[i+1:], s[i:])
	s[i] = n
	*c = s
	return n
}
// lst is a doubly linked list of cache nodes, used as an LRU list
// (front == most recently used).
type lst struct {
	front, back *node
}
  1603. func (l *lst) pushFront(n *node) *node {
  1604. if l.front == nil {
  1605. l.front, l.back, n.prev, n.next = n, n, nil, nil
  1606. return n
  1607. }
  1608. n.prev, n.next, l.front.prev, l.front = nil, l.front, n, n
  1609. return n
  1610. }
  1611. func (l *lst) remove(n *node) *node {
  1612. if n.prev == nil {
  1613. l.front = n.next
  1614. } else {
  1615. n.prev.next = n.next
  1616. }
  1617. if n.next == nil {
  1618. l.back = n.prev
  1619. } else {
  1620. n.next.prev = n.prev
  1621. }
  1622. n.prev, n.next = nil, nil
  1623. return n
  1624. }
// removeBack unlinks and returns the last (least recently used) node.
func (l *lst) removeBack() *node {
	return l.remove(l.back)
}
// moveToFront makes n the first (most recently used) node of the list.
func (l *lst) moveToFront(n *node) *node {
	return l.pushFront(l.remove(n))
}
  1631. func (l *lst) size() (sz int64) {
  1632. for n := l.front; n != nil; n = n.next {
  1633. sz += int64(cap(n.b))
  1634. }
  1635. return
  1636. }
// cacheAudit cross-checks the handle->node map m against the LRU list l:
// every mapped node must carry its map key in .h and be present on the list,
// and the list length must equal len(m). Note that l.audit walks the whole
// list for every map entry, so this is O(len(m)^2) - test/debug use only.
func cacheAudit(m map[int64]*node, l *lst) (err error) {
	cnt := 0
	for h, n := range m {
		if g, e := n.h, h; g != e {
			return fmt.Errorf("cacheAudit: invalid node handle %d != %d", g, e)
		}

		// cnt ends up holding the list length from the last audit; all
		// iterations audit the same list, so any one value works below.
		if cnt, err = l.audit(n, true); err != nil {
			return
		}
	}

	if g, e := cnt, len(m); g != e {
		return fmt.Errorf("cacheAudit: invalid cache size %d != %d", g, e)
	}

	return
}
// audit sanity-checks the list's linkage and returns the number of nodes on
// it. If onList is true, n must be an element of the list; otherwise n must
// be detached (nil prev/next). On failure cnt is -1 and err describes the
// inconsistency.
func (l *lst) audit(n *node, onList bool) (cnt int, err error) {
	if !onList && (n.prev != nil || n.next != nil) {
		return -1, fmt.Errorf("lst.audit: free node with non nil linkage")
	}

	if l.front == nil && l.back != nil || l.back == nil && l.front != nil {
		return -1, fmt.Errorf("lst.audit: one of .front/.back is nil while the other is non nil")
	}

	// Single element list invariants.
	if l.front == l.back && l.front != nil {
		x := l.front
		if x.prev != nil || x.next != nil {
			return -1, fmt.Errorf("lst.audit: single node has non nil linkage")
		}

		if onList && x != n {
			return -1, fmt.Errorf("lst.audit: single node is alien")
		}
	}

	// Walk front-to-back verifying backward links and counting nodes.
	seen := false
	var prev *node
	x := l.front
	for x != nil {
		cnt++
		if x.prev != prev {
			return -1, fmt.Errorf("lst.audit: broken .prev linkage")
		}

		if x == n {
			seen = true
		}

		prev = x
		x = x.next
	}

	if prev != l.back {
		return -1, fmt.Errorf("lst.audit: broken .back linkage")
	}

	if onList && !seen {
		return -1, fmt.Errorf("lst.audit: node missing in list")
	}

	if !onList && seen {
		return -1, fmt.Errorf("lst.audit: node should not be on the list")
	}

	return
}