1
0

mph_matcher.go 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. package strmatcher
  2. import (
  3. "math/bits"
  4. "regexp"
  5. "sort"
  6. "strings"
  7. "unsafe"
  8. )
  9. // PrimeRK is the prime base used in Rabin-Karp algorithm.
  10. const PrimeRK = 16777619
  11. // calculate the rolling murmurHash of given string
  12. func RollingHash(s string) uint32 {
  13. h := uint32(0)
  14. for i := len(s) - 1; i >= 0; i-- {
  15. h = h*PrimeRK + uint32(s[i])
  16. }
  17. return h
  18. }
  19. // A MphMatcherGroup is divided into three parts:
  20. // 1. `full` and `domain` patterns are matched by Rabin-Karp algorithm and minimal perfect hash table;
  21. // 2. `substr` patterns are matched by ac automaton;
  22. // 3. `regex` patterns are matched with the regex library.
  23. type MphMatcherGroup struct {
  24. ac *ACAutomaton
  25. otherMatchers []matcherEntry
  26. rules []string
  27. level0 []uint32
  28. level0Mask int
  29. level1 []uint32
  30. level1Mask int
  31. count uint32
  32. ruleMap *map[string]uint32
  33. }
  34. func (g *MphMatcherGroup) AddFullOrDomainPattern(pattern string, t Type) {
  35. h := RollingHash(pattern)
  36. switch t {
  37. case Domain:
  38. (*g.ruleMap)["."+pattern] = h*PrimeRK + uint32('.')
  39. fallthrough
  40. case Full:
  41. (*g.ruleMap)[pattern] = h
  42. default:
  43. }
  44. }
  45. func NewMphMatcherGroup() *MphMatcherGroup {
  46. return &MphMatcherGroup{
  47. ac: nil,
  48. otherMatchers: nil,
  49. rules: nil,
  50. level0: nil,
  51. level0Mask: 0,
  52. level1: nil,
  53. level1Mask: 0,
  54. count: 1,
  55. ruleMap: &map[string]uint32{},
  56. }
  57. }
  58. // AddPattern adds a pattern to MphMatcherGroup
  59. func (g *MphMatcherGroup) AddPattern(pattern string, t Type) (uint32, error) {
  60. switch t {
  61. case Substr:
  62. if g.ac == nil {
  63. g.ac = NewACAutomaton()
  64. }
  65. g.ac.Add(pattern, t)
  66. case Full, Domain:
  67. pattern = strings.ToLower(pattern)
  68. g.AddFullOrDomainPattern(pattern, t)
  69. case Regex:
  70. r, err := regexp.Compile(pattern)
  71. if err != nil {
  72. return 0, err
  73. }
  74. g.otherMatchers = append(g.otherMatchers, matcherEntry{
  75. m: &regexMatcher{pattern: r},
  76. id: g.count,
  77. })
  78. default:
  79. panic("Unknown type")
  80. }
  81. return g.count, nil
  82. }
  83. // Build builds a minimal perfect hash table and ac automaton from insert rules
  84. func (g *MphMatcherGroup) Build() {
  85. if g.ac != nil {
  86. g.ac.Build()
  87. }
  88. keyLen := len(*g.ruleMap)
  89. if keyLen == 0 {
  90. keyLen = 1
  91. (*g.ruleMap)["empty___"] = RollingHash("empty___")
  92. }
  93. g.level0 = make([]uint32, nextPow2(keyLen/4))
  94. g.level0Mask = len(g.level0) - 1
  95. g.level1 = make([]uint32, nextPow2(keyLen))
  96. g.level1Mask = len(g.level1) - 1
  97. sparseBuckets := make([][]int, len(g.level0))
  98. var ruleIdx int
  99. for rule, hash := range *g.ruleMap {
  100. n := int(hash) & g.level0Mask
  101. g.rules = append(g.rules, rule)
  102. sparseBuckets[n] = append(sparseBuckets[n], ruleIdx)
  103. ruleIdx++
  104. }
  105. g.ruleMap = nil
  106. var buckets []indexBucket
  107. for n, vals := range sparseBuckets {
  108. if len(vals) > 0 {
  109. buckets = append(buckets, indexBucket{n, vals})
  110. }
  111. }
  112. sort.Sort(bySize(buckets))
  113. occ := make([]bool, len(g.level1))
  114. var tmpOcc []int
  115. for _, bucket := range buckets {
  116. seed := uint32(0)
  117. for {
  118. findSeed := true
  119. tmpOcc = tmpOcc[:0]
  120. for _, i := range bucket.vals {
  121. n := int(strhashFallback(unsafe.Pointer(&g.rules[i]), uintptr(seed))) & g.level1Mask
  122. if occ[n] {
  123. for _, n := range tmpOcc {
  124. occ[n] = false
  125. }
  126. seed++
  127. findSeed = false
  128. break
  129. }
  130. occ[n] = true
  131. tmpOcc = append(tmpOcc, n)
  132. g.level1[n] = uint32(i)
  133. }
  134. if findSeed {
  135. g.level0[bucket.n] = seed
  136. break
  137. }
  138. }
  139. }
  140. }
  141. func nextPow2(v int) int {
  142. if v <= 1 {
  143. return 1
  144. }
  145. const MaxUInt = ^uint(0)
  146. n := (MaxUInt >> bits.LeadingZeros(uint(v))) + 1
  147. return int(n)
  148. }
  149. // Lookup searches for s in t and returns its index and whether it was found.
  150. func (g *MphMatcherGroup) Lookup(h uint32, s string) bool {
  151. i0 := int(h) & g.level0Mask
  152. seed := g.level0[i0]
  153. i1 := int(strhashFallback(unsafe.Pointer(&s), uintptr(seed))) & g.level1Mask
  154. n := g.level1[i1]
  155. return s == g.rules[int(n)]
  156. }
  157. // Match implements IndexMatcher.Match.
  158. func (g *MphMatcherGroup) Match(pattern string) []uint32 {
  159. result := []uint32{}
  160. hash := uint32(0)
  161. for i := len(pattern) - 1; i >= 0; i-- {
  162. hash = hash*PrimeRK + uint32(pattern[i])
  163. if pattern[i] == '.' {
  164. if g.Lookup(hash, pattern[i:]) {
  165. result = append(result, 1)
  166. return result
  167. }
  168. }
  169. }
  170. if g.Lookup(hash, pattern) {
  171. result = append(result, 1)
  172. return result
  173. }
  174. if g.ac != nil && g.ac.Match(pattern) {
  175. result = append(result, 1)
  176. return result
  177. }
  178. for _, e := range g.otherMatchers {
  179. if e.m.Match(pattern) {
  180. result = append(result, e.id)
  181. return result
  182. }
  183. }
  184. return nil
  185. }
  186. type indexBucket struct {
  187. n int
  188. vals []int
  189. }
  190. type bySize []indexBucket
  191. func (s bySize) Len() int { return len(s) }
  192. func (s bySize) Less(i, j int) bool { return len(s[i].vals) > len(s[j].vals) }
  193. func (s bySize) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
  194. type stringStruct struct {
  195. str unsafe.Pointer
  196. len int
  197. }
  198. func strhashFallback(a unsafe.Pointer, h uintptr) uintptr {
  199. x := (*stringStruct)(a)
  200. return memhashFallback(x.str, h, uintptr(x.len))
  201. }
  202. const (
  203. // Constants for multiplication: four random odd 64-bit numbers.
  204. m1 = 16877499708836156737
  205. m2 = 2820277070424839065
  206. m3 = 9497967016996688599
  207. m4 = 15839092249703872147
  208. )
  209. var hashkey = [4]uintptr{1, 1, 1, 1}
  210. func memhashFallback(p unsafe.Pointer, seed, s uintptr) uintptr {
  211. h := uint64(seed + s*hashkey[0])
  212. tail:
  213. switch {
  214. case s == 0:
  215. case s < 4:
  216. h ^= uint64(*(*byte)(p))
  217. h ^= uint64(*(*byte)(add(p, s>>1))) << 8
  218. h ^= uint64(*(*byte)(add(p, s-1))) << 16
  219. h = rotl31(h*m1) * m2
  220. case s <= 8:
  221. h ^= uint64(readUnaligned32(p))
  222. h ^= uint64(readUnaligned32(add(p, s-4))) << 32
  223. h = rotl31(h*m1) * m2
  224. case s <= 16:
  225. h ^= readUnaligned64(p)
  226. h = rotl31(h*m1) * m2
  227. h ^= readUnaligned64(add(p, s-8))
  228. h = rotl31(h*m1) * m2
  229. case s <= 32:
  230. h ^= readUnaligned64(p)
  231. h = rotl31(h*m1) * m2
  232. h ^= readUnaligned64(add(p, 8))
  233. h = rotl31(h*m1) * m2
  234. h ^= readUnaligned64(add(p, s-16))
  235. h = rotl31(h*m1) * m2
  236. h ^= readUnaligned64(add(p, s-8))
  237. h = rotl31(h*m1) * m2
  238. default:
  239. v1 := h
  240. v2 := uint64(seed * hashkey[1])
  241. v3 := uint64(seed * hashkey[2])
  242. v4 := uint64(seed * hashkey[3])
  243. for s >= 32 {
  244. v1 ^= readUnaligned64(p)
  245. v1 = rotl31(v1*m1) * m2
  246. p = add(p, 8)
  247. v2 ^= readUnaligned64(p)
  248. v2 = rotl31(v2*m2) * m3
  249. p = add(p, 8)
  250. v3 ^= readUnaligned64(p)
  251. v3 = rotl31(v3*m3) * m4
  252. p = add(p, 8)
  253. v4 ^= readUnaligned64(p)
  254. v4 = rotl31(v4*m4) * m1
  255. p = add(p, 8)
  256. s -= 32
  257. }
  258. h = v1 ^ v2 ^ v3 ^ v4
  259. goto tail
  260. }
  261. h ^= h >> 29
  262. h *= m3
  263. h ^= h >> 32
  264. return uintptr(h)
  265. }
  266. func add(p unsafe.Pointer, x uintptr) unsafe.Pointer {
  267. return unsafe.Pointer(uintptr(p) + x)
  268. }
  269. func readUnaligned32(p unsafe.Pointer) uint32 {
  270. q := (*[4]byte)(p)
  271. return uint32(q[0]) | uint32(q[1])<<8 | uint32(q[2])<<16 | uint32(q[3])<<24
  272. }
  273. func rotl31(x uint64) uint64 {
  274. return (x << 31) | (x >> (64 - 31))
  275. }
  276. func readUnaligned64(p unsafe.Pointer) uint64 {
  277. q := (*[8]byte)(p)
  278. return uint64(q[0]) | uint64(q[1])<<8 | uint64(q[2])<<16 | uint64(q[3])<<24 | uint64(q[4])<<32 | uint64(q[5])<<40 | uint64(q[6])<<48 | uint64(q[7])<<56
  279. }