weakhash.go 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. // Copyright (C) 2016 The Syncthing Authors.
  2. //
  3. // This Source Code Form is subject to the terms of the Mozilla Public
  4. // License, v. 2.0. If a copy of the MPL was not distributed with this file,
  5. // You can obtain one at http://mozilla.org/MPL/2.0/.
  6. package weakhash
  7. import (
  8. "bufio"
  9. "hash"
  10. "io"
  11. "os"
  12. )
  13. const (
  14. Size = 4
  15. )
  16. func NewHash(size int) hash.Hash32 {
  17. return &digest{
  18. buf: make([]byte, size),
  19. size: size,
  20. }
  21. }
  22. // Find finds all the blocks of the given size within io.Reader that matches
  23. // the hashes provided, and returns a hash -> slice of offsets within reader
  24. // map, that produces the same weak hash.
  25. func Find(ir io.Reader, hashesToFind []uint32, size int) (map[uint32][]int64, error) {
  26. if ir == nil {
  27. return nil, nil
  28. }
  29. r := bufio.NewReader(ir)
  30. hf := NewHash(size)
  31. n, err := io.CopyN(hf, r, int64(size))
  32. if err == io.EOF {
  33. return nil, nil
  34. }
  35. if err != nil {
  36. return nil, err
  37. }
  38. if n != int64(size) {
  39. return nil, io.ErrShortBuffer
  40. }
  41. offsets := make(map[uint32][]int64)
  42. for _, hashToFind := range hashesToFind {
  43. offsets[hashToFind] = nil
  44. }
  45. var i int64
  46. var hash uint32
  47. for {
  48. hash = hf.Sum32()
  49. if existing, ok := offsets[hash]; ok {
  50. offsets[hash] = append(existing, i)
  51. }
  52. i++
  53. bt, err := r.ReadByte()
  54. if err == io.EOF {
  55. break
  56. } else if err != nil {
  57. return offsets, err
  58. }
  59. hf.Write([]byte{bt})
  60. }
  61. return offsets, nil
  62. }
  63. // Using this: http://tutorials.jenkov.com/rsync/checksums.html
  64. // Example implementations: https://gist.github.com/csabahenk/1096262/revisions
  65. // Alternative that could be used is adler32 http://blog.liw.fi/posts/rsync-in-python/#comment-fee8d5e07794fdba3fe2d76aa2706a13
  66. type digest struct {
  67. buf []byte
  68. size int
  69. a uint16
  70. b uint16
  71. j int
  72. }
  73. func (d *digest) Write(data []byte) (int, error) {
  74. for _, c := range data {
  75. // TODO: Use this in Go 1.6
  76. // d.a = d.a - uint16(d.buf[d.j]) + uint16(c)
  77. // d.b = d.b - uint16(d.size)*uint16(d.buf[d.j]) + d.a
  78. d.a -= uint16(d.buf[d.j])
  79. d.a += uint16(c)
  80. d.b -= uint16(d.size) * uint16(d.buf[d.j])
  81. d.b += d.a
  82. d.buf[d.j] = c
  83. d.j = (d.j + 1) % d.size
  84. }
  85. return len(data), nil
  86. }
  87. func (d *digest) Reset() {
  88. for i := range d.buf {
  89. d.buf[i] = 0x0
  90. }
  91. d.a = 0
  92. d.b = 0
  93. d.j = 0
  94. }
  95. func (d *digest) Sum(b []byte) []byte {
  96. r := d.Sum32()
  97. return append(b, byte(r>>24), byte(r>>16), byte(r>>8), byte(r))
  98. }
  99. func (d *digest) Sum32() uint32 { return uint32(d.a) | (uint32(d.b) << 16) }
  100. func (digest) Size() int { return Size }
  101. func (digest) BlockSize() int { return 1 }
  102. func NewFinder(path string, size int, hashesToFind []uint32) (*Finder, error) {
  103. file, err := os.Open(path)
  104. if err != nil {
  105. return nil, err
  106. }
  107. offsets, err := Find(file, hashesToFind, size)
  108. if err != nil {
  109. file.Close()
  110. return nil, err
  111. }
  112. return &Finder{
  113. file: file,
  114. size: size,
  115. offsets: offsets,
  116. }, nil
  117. }
  118. type Finder struct {
  119. file *os.File
  120. size int
  121. offsets map[uint32][]int64
  122. }
  123. // Iterate iterates all available blocks that matches the provided hash, reads
  124. // them into buf, and calls the iterator function. The iterator function should
  125. // return wether it wishes to continue interating.
  126. func (h *Finder) Iterate(hash uint32, buf []byte, iterFunc func(int64) bool) (bool, error) {
  127. if h == nil || hash == 0 || len(buf) != h.size {
  128. return false, nil
  129. }
  130. for _, offset := range h.offsets[hash] {
  131. _, err := h.file.ReadAt(buf, offset)
  132. if err != nil {
  133. return false, err
  134. }
  135. if !iterFunc(offset) {
  136. return true, nil
  137. }
  138. }
  139. return false, nil
  140. }
  141. // Close releases any resource associated with the finder
  142. func (h *Finder) Close() {
  143. if h != nil {
  144. h.file.Close()
  145. }
  146. }