1
1

SimHash.cs 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122
  1. using System.Linq;
  2. using System.Numerics;
  3. namespace Masuit.Tools.Strings;
  4. public class SimHash
  5. {
  6. private readonly string _tokens;
  7. private readonly BigInteger _strSimHash;
  8. private readonly int _hashBits = 128;
  9. public BigInteger StrSimHash => _strSimHash;
  10. public SimHash(string tokens, int hashBits)
  11. {
  12. _tokens = tokens;
  13. _hashBits = hashBits;
  14. _strSimHash = GetSimHash();
  15. }
  16. public SimHash(string tokens)
  17. {
  18. _tokens = tokens;
  19. _strSimHash = GetSimHash();
  20. }
  21. private BigInteger GetSimHash()
  22. {
  23. var v = new int[_hashBits];
  24. var stringTokens = new SimTokenizer(_tokens);
  25. while (stringTokens.HasMoreTokens())
  26. {
  27. var temp = stringTokens.NextToken();
  28. var t = Hash(temp);
  29. for (var i = 0; i < _hashBits; i++)
  30. {
  31. var bitmask = BigInteger.One << i;
  32. if ((t & bitmask).Sign != 0)
  33. {
  34. v[i] += 1;
  35. }
  36. else
  37. {
  38. v[i] -= 1;
  39. }
  40. }
  41. }
  42. var fingerprint = BigInteger.Zero;
  43. for (var i = 0; i < _hashBits; i++)
  44. {
  45. if (v[i] >= 0)
  46. {
  47. fingerprint += BigInteger.Parse("1") << i;
  48. }
  49. }
  50. return fingerprint;
  51. }
  52. private BigInteger Hash(string source)
  53. {
  54. if (string.IsNullOrEmpty(source))
  55. {
  56. return BigInteger.Zero;
  57. }
  58. var sourceArray = source.ToCharArray();
  59. var x = new BigInteger((long)sourceArray[0] << 7);
  60. var m = BigInteger.Parse("1000003");
  61. var mask = BigInteger.Pow(new BigInteger(2), _hashBits) - BigInteger.One;
  62. x = sourceArray.Select(item => new BigInteger((long)item)).Aggregate(x, (current, temp) => ((current * m) ^ temp) & mask);
  63. x ^= new BigInteger(source.Length);
  64. if (x.Equals(BigInteger.MinusOne))
  65. {
  66. x = new BigInteger(-2);
  67. }
  68. return x;
  69. }
  70. public int HammingDistance(SimHash other)
  71. {
  72. var m = (BigInteger.One << _hashBits) - BigInteger.One;
  73. var x = (_strSimHash ^ other._strSimHash) & m;
  74. var tot = 0;
  75. while (x.Sign != 0)
  76. {
  77. tot += 1;
  78. x &= x - BigInteger.One;
  79. }
  80. return tot;
  81. }
  82. }
  83. //简单的分词法,直接将中文分成单个汉字。可以用其他分词法代替
  84. public class SimTokenizer
  85. {
  86. private readonly string _source;
  87. private int _index;
  88. private readonly int _length;
  89. public SimTokenizer(string source)
  90. {
  91. _source = source;
  92. _index = 0;
  93. _length = (source ?? "").Length;
  94. }
  95. public bool HasMoreTokens()
  96. {
  97. return _index < _length;
  98. }
  99. public string NextToken()
  100. {
  101. var s = _source.Substring(_index, 1);
  102. _index++;
  103. return s;
  104. }
  105. }