瀏覽代碼

Implement detection of typos in json using Levenshtein Distance

Ivan Savenko 1 年之前
父節點
當前提交
e09fbe5ea4
共有 1 個文件被更改,包括 81 次插入1 次删除
  1. 81 1
      lib/json/JsonValidator.cpp

+ 81 - 1
lib/json/JsonValidator.cpp

@@ -21,6 +21,80 @@
 
 VCMI_LIB_NAMESPACE_BEGIN
 
+// Algorithm for detection of typos in words
+// Determines how 'different' two strings are - how many changes must be done to turn one string into another one
+// https://en.wikipedia.org/wiki/Levenshtein_distance#Iterative_with_two_matrix_rows
+static int getLevenshteinDistance(const std::string & s, const std::string & t)
+{
+	int n = t.size();
+	int m = s.size();
+
+	// create two work vectors of integer distances
+	std::vector<int> v0(n+1, 0);
+	std::vector<int> v1(n+1, 0);
+
+	// initialize v0 (the previous row of distances)
+	// this row is A[0][i]: edit distance from an empty s to t;
+	// that distance is the number of characters to append to  s to make t.
+	for (int i = 0; i < n; ++i)
+		v0[i] = i;
+
+	for (int i = 0; i < m; ++i)
+	{
+		// calculate v1 (current row distances) from the previous row v0
+
+		// first element of v1 is A[i + 1][0]
+		// edit distance is delete (i + 1) chars from s to match empty t
+		v1[0] = i + 1;
+
+		// use formula to fill in the rest of the row
+		for (int j = 0; j < n; ++j)
+		{
+			// calculating costs for A[i + 1][j + 1]
+			int deletionCost = v0[j + 1] + 1;
+			int insertionCost = v1[j] + 1;
+			int substitutionCost;
+
+			if (s[i] == t[j])
+				substitutionCost = v0[j];
+			else
+				substitutionCost = v0[j] + 1;
+
+			v1[j + 1] = std::min({deletionCost, insertionCost, substitutionCost});
+		}
+
+		// copy v1 (current row) to v0 (previous row) for next iteration
+		// since data in v1 is always invalidated, a swap without copy could be more efficient
+		std::swap(v0, v1);
+	}
+
+	// after the last swap, the results of v1 are now in v0
+	return v0[n];
+}
+
+/// Searches for keys similar to 'target' in 'candidates' map
+/// Returns closest match or empty string if no suitable candidates are found
+static std::string findClosestMatch(JsonMap candidates, std::string target)
+{
+	// Maximum distance at which we can consider strings to be similar
+	// If strings have more different symbols than this number then it is not a typo, but a completely different word
+	static constexpr int maxDistance = 5;
+	int bestDistance = maxDistance;
+	std::string bestMatch;
+
+	for (auto const & candidate : candidates)
+	{
+		int newDistance = getLevenshteinDistance(candidate.first, target);
+
+		if (newDistance < bestDistance)
+		{
+			bestDistance = newDistance;
+			bestMatch = candidate.first;
+		}
+	}
+	return bestMatch;
+}
+
 static std::string emptyCheck(JsonValidator & validator, const JsonNode & baseSchema, const JsonNode & schema, const JsonNode & data)
 {
 	// check is not needed - e.g. incorporated into another check
@@ -417,7 +491,13 @@ static std::string additionalPropertiesCheck(JsonValidator & validator, const Js
 
 			// or, additionalItems field can be bool which indicates if such items are allowed
 			else if(!schema.isNull() && !schema.Bool()) // present and set to false - error
-				errors += validator.makeErrorMessage("Unknown entry found: " + entry.first);
+			{
+				std::string bestCandidate = findClosestMatch(baseSchema["properties"].Struct(), entry.first);
+				if (!bestCandidate.empty())
+					errors += validator.makeErrorMessage("Unknown entry found: '" + entry.first + "'. Perhaps you meant '" + bestCandidate + "'?");
+				else
+					errors += validator.makeErrorMessage("Unknown entry found: " + entry.first);
+			}
 		}
 	}
 	return errors;