ac_automaton_matcher.go 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. package strmatcher
  2. import (
  3. "container/list"
  4. )
  5. const validCharCount = 53
  6. type MatchType struct {
  7. matchType Type
  8. exist bool
  9. }
  10. const (
  11. TrieEdge bool = true
  12. FailEdge bool = false
  13. )
  14. type Edge struct {
  15. edgeType bool
  16. nextNode int
  17. }
  18. type ACAutomaton struct {
  19. trie [][validCharCount]Edge
  20. fail []int
  21. exists []MatchType
  22. count int
  23. }
  24. func newNode() [validCharCount]Edge {
  25. var s [validCharCount]Edge
  26. for i := range s {
  27. s[i] = Edge{
  28. edgeType: FailEdge,
  29. nextNode: 0,
  30. }
  31. }
  32. return s
  33. }
  34. var char2Index = []int{
  35. 'A': 0,
  36. 'a': 0,
  37. 'B': 1,
  38. 'b': 1,
  39. 'C': 2,
  40. 'c': 2,
  41. 'D': 3,
  42. 'd': 3,
  43. 'E': 4,
  44. 'e': 4,
  45. 'F': 5,
  46. 'f': 5,
  47. 'G': 6,
  48. 'g': 6,
  49. 'H': 7,
  50. 'h': 7,
  51. 'I': 8,
  52. 'i': 8,
  53. 'J': 9,
  54. 'j': 9,
  55. 'K': 10,
  56. 'k': 10,
  57. 'L': 11,
  58. 'l': 11,
  59. 'M': 12,
  60. 'm': 12,
  61. 'N': 13,
  62. 'n': 13,
  63. 'O': 14,
  64. 'o': 14,
  65. 'P': 15,
  66. 'p': 15,
  67. 'Q': 16,
  68. 'q': 16,
  69. 'R': 17,
  70. 'r': 17,
  71. 'S': 18,
  72. 's': 18,
  73. 'T': 19,
  74. 't': 19,
  75. 'U': 20,
  76. 'u': 20,
  77. 'V': 21,
  78. 'v': 21,
  79. 'W': 22,
  80. 'w': 22,
  81. 'X': 23,
  82. 'x': 23,
  83. 'Y': 24,
  84. 'y': 24,
  85. 'Z': 25,
  86. 'z': 25,
  87. '!': 26,
  88. '$': 27,
  89. '&': 28,
  90. '\'': 29,
  91. '(': 30,
  92. ')': 31,
  93. '*': 32,
  94. '+': 33,
  95. ',': 34,
  96. ';': 35,
  97. '=': 36,
  98. ':': 37,
  99. '%': 38,
  100. '-': 39,
  101. '.': 40,
  102. '_': 41,
  103. '~': 42,
  104. '0': 43,
  105. '1': 44,
  106. '2': 45,
  107. '3': 46,
  108. '4': 47,
  109. '5': 48,
  110. '6': 49,
  111. '7': 50,
  112. '8': 51,
  113. '9': 52,
  114. }
  115. func NewACAutomaton() *ACAutomaton {
  116. ac := new(ACAutomaton)
  117. ac.trie = append(ac.trie, newNode())
  118. ac.fail = append(ac.fail, 0)
  119. ac.exists = append(ac.exists, MatchType{
  120. matchType: Full,
  121. exist: false,
  122. })
  123. return ac
  124. }
  125. func (ac *ACAutomaton) Add(domain string, t Type) {
  126. node := 0
  127. for i := len(domain) - 1; i >= 0; i-- {
  128. idx := char2Index[domain[i]]
  129. if ac.trie[node][idx].nextNode == 0 {
  130. ac.count++
  131. if len(ac.trie) < ac.count+1 {
  132. ac.trie = append(ac.trie, newNode())
  133. ac.fail = append(ac.fail, 0)
  134. ac.exists = append(ac.exists, MatchType{
  135. matchType: Full,
  136. exist: false,
  137. })
  138. }
  139. ac.trie[node][idx] = Edge{
  140. edgeType: TrieEdge,
  141. nextNode: ac.count,
  142. }
  143. }
  144. node = ac.trie[node][idx].nextNode
  145. }
  146. ac.exists[node] = MatchType{
  147. matchType: t,
  148. exist: true,
  149. }
  150. switch t {
  151. case Domain:
  152. ac.exists[node] = MatchType{
  153. matchType: Full,
  154. exist: true,
  155. }
  156. idx := char2Index['.']
  157. if ac.trie[node][idx].nextNode == 0 {
  158. ac.count++
  159. if len(ac.trie) < ac.count+1 {
  160. ac.trie = append(ac.trie, newNode())
  161. ac.fail = append(ac.fail, 0)
  162. ac.exists = append(ac.exists, MatchType{
  163. matchType: Full,
  164. exist: false,
  165. })
  166. }
  167. ac.trie[node][idx] = Edge{
  168. edgeType: TrieEdge,
  169. nextNode: ac.count,
  170. }
  171. }
  172. node = ac.trie[node][idx].nextNode
  173. ac.exists[node] = MatchType{
  174. matchType: t,
  175. exist: true,
  176. }
  177. default:
  178. break
  179. }
  180. }
  181. func (ac *ACAutomaton) Build() {
  182. queue := list.New()
  183. for i := 0; i < validCharCount; i++ {
  184. if ac.trie[0][i].nextNode != 0 {
  185. queue.PushBack(ac.trie[0][i])
  186. }
  187. }
  188. for {
  189. front := queue.Front()
  190. if front == nil {
  191. break
  192. } else {
  193. node := front.Value.(Edge).nextNode
  194. queue.Remove(front)
  195. for i := 0; i < validCharCount; i++ {
  196. if ac.trie[node][i].nextNode != 0 {
  197. ac.fail[ac.trie[node][i].nextNode] = ac.trie[ac.fail[node]][i].nextNode
  198. queue.PushBack(ac.trie[node][i])
  199. } else {
  200. ac.trie[node][i] = Edge{
  201. edgeType: FailEdge,
  202. nextNode: ac.trie[ac.fail[node]][i].nextNode,
  203. }
  204. }
  205. }
  206. }
  207. }
  208. }
  209. func (ac *ACAutomaton) Match(s string) bool {
  210. node := 0
  211. fullMatch := true
  212. // 1. the match string is all through trie edge. FULL MATCH or DOMAIN
  213. // 2. the match string is through a fail edge. NOT FULL MATCH
  214. // 2.1 Through a fail edge, but there exists a valid node. SUBSTR
  215. for i := len(s) - 1; i >= 0; i-- {
  216. idx := char2Index[s[i]]
  217. fullMatch = fullMatch && ac.trie[node][idx].edgeType
  218. node = ac.trie[node][idx].nextNode
  219. switch ac.exists[node].matchType {
  220. case Substr:
  221. return true
  222. case Domain:
  223. if fullMatch {
  224. return true
  225. }
  226. default:
  227. break
  228. }
  229. }
  230. return fullMatch && ac.exists[node].exist
  231. }