commit 58326f679e3fb5b19c0bce667bdc7ea75066813a
parent 90180202b68b84fa19909e5dce08a2247544d31f
Author: kernelkind <kernelkind@gmail.com>
Date: Tue, 13 Feb 2024 22:20:18 -0500
translate: implement string distance for close matches
Implement the levenshtein string distance algorithm for determining
if the translation is too similar to the original content.
Closes: https://github.com/damus-io/damus/issues/1996
Lightning-address: kernelkind@getalby.com
Signed-off-by: kernelkind <kernelkind@gmail.com>
Reviewed-by: William Casarin <jb55@jb55.com>
Link: 20240214032018.57812-1-kernelkind@gmail.com
Signed-off-by: William Casarin <jb55@jb55.com>
Diffstat:
3 files changed, 119 insertions(+), 0 deletions(-)
diff --git a/damus.xcodeproj/project.pbxproj b/damus.xcodeproj/project.pbxproj
@@ -627,6 +627,7 @@
D7FF94002AC7AC5300FD969D /* RelayURL.swift in Sources */ = {isa = PBXBuildFile; fileRef = D7FF93FF2AC7AC5200FD969D /* RelayURL.swift */; };
E02B54182B4DFADA0077FF42 /* Bech32ObjectTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E02B54172B4DFADA0077FF42 /* Bech32ObjectTests.swift */; };
E04A37C62B544F090029650D /* URIParsing.swift in Sources */ = {isa = PBXBuildFile; fileRef = E04A37C52B544F090029650D /* URIParsing.swift */; };
+ E0E024112B7C19C20075735D /* TranslationTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E0E024102B7C19C20075735D /* TranslationTests.swift */; };
E4FA1C032A24BB7F00482697 /* SearchSettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E4FA1C022A24BB7F00482697 /* SearchSettingsView.swift */; };
E990020F2955F837003BBC5A /* EditMetadataView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E990020E2955F837003BBC5A /* EditMetadataView.swift */; };
E9E4ED0B295867B900DD7078 /* ThreadView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E9E4ED0A295867B900DD7078 /* ThreadView.swift */; };
@@ -1402,6 +1403,7 @@
D7FF93FF2AC7AC5200FD969D /* RelayURL.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = RelayURL.swift; sourceTree = "<group>"; };
E02B54172B4DFADA0077FF42 /* Bech32ObjectTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Bech32ObjectTests.swift; sourceTree = "<group>"; };
E04A37C52B544F090029650D /* URIParsing.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = URIParsing.swift; sourceTree = "<group>"; };
+ E0E024102B7C19C20075735D /* TranslationTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TranslationTests.swift; sourceTree = "<group>"; };
E4FA1C022A24BB7F00482697 /* SearchSettingsView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SearchSettingsView.swift; sourceTree = "<group>"; };
E990020E2955F837003BBC5A /* EditMetadataView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = EditMetadataView.swift; sourceTree = "<group>"; };
E9E4ED0A295867B900DD7078 /* ThreadView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ThreadView.swift; sourceTree = "<group>"; };
@@ -2504,6 +2506,7 @@
D72A2CFF2AD9B66B002AFF62 /* EventViewTests.swift */,
D7315A2B2ACDF4DA0036E30A /* DamusCacheManagerTests.swift */,
B501062C2B363036003874F5 /* AuthIntegrationTests.swift */,
+ E0E024102B7C19C20075735D /* TranslationTests.swift */,
);
path = damusTests;
sourceTree = "<group>";
@@ -3443,6 +3446,7 @@
D7315A2C2ACDF4DA0036E30A /* DamusCacheManagerTests.swift in Sources */,
4C9054852A6AEAA000811EEC /* NdbTests.swift in Sources */,
75AD872B2AA23A460085EF2C /* Block+Tests.swift in Sources */,
+ E0E024112B7C19C20075735D /* TranslationTests.swift in Sources */,
F944F56E29EA9CCC0067B3BF /* DamusParseContentTests.swift in Sources */,
3A5E47C72A4A76C800C0D090 /* TrieTests.swift in Sources */,
B501062D2B363036003874F5 /* AuthIntegrationTests.swift in Sources */,
diff --git a/damus/Components/TranslateView.swift b/damus/Components/TranslateView.swift
@@ -21,6 +21,8 @@ enum TranslateStatus: Equatable {
case not_needed
}
+fileprivate let MIN_UNIQUE_CHARS = 2
+
struct TranslateView: View {
let damus_state: DamusState
let event: NostrEvent
@@ -107,6 +109,10 @@ struct TranslateView: View {
attempt_translation()
}
}
+
+ func translationMeetsStringDistanceRequirements(original: String, translated: String) -> Bool {
+ return levenshteinDistanceIsGreaterThanOrEqualTo(from: original, to: translated, threshold: MIN_UNIQUE_CHARS)
+ }
}
extension View {
@@ -141,6 +147,10 @@ func translate_note(profiles: Profiles, keypair: Keypair, event: NostrEvent, set
// if its the same, give up and don't retry
return .not_needed
}
+
+ guard translationMeetsStringDistanceRequirements(original: originalContent, translated: translated_note) else {
+ return .not_needed
+ }
// Render translated note
let translated_blocks = parse_note_content(content: .content(translated_note, event.tags))
@@ -158,3 +168,50 @@ func current_language() -> String {
}
}
+func levenshteinDistanceIsGreaterThanOrEqualTo(from source: String, to target: String, threshold: Int) -> Bool {
+ let sourceCount = source.count
+ let targetCount = target.count
+
+ // Early return if the difference in lengths is already greater than or equal to the threshold,
+ // indicating the edit distance meets the condition without further calculation.
+ if abs(sourceCount - targetCount) >= threshold {
+ return true
+ }
+
+ var matrix = [[Int]](repeating: [Int](repeating: 0, count: targetCount + 1), count: sourceCount + 1)
+
+ for i in 0...sourceCount {
+ matrix[i][0] = i
+ }
+
+ for j in 0...targetCount {
+ matrix[0][j] = j
+ }
+
+ for i in 1...sourceCount {
+ var rowMin = Int.max
+ for j in 1...targetCount {
+ let sourceIndex = source.index(source.startIndex, offsetBy: i - 1)
+ let targetIndex = target.index(target.startIndex, offsetBy: j - 1)
+
+ let cost = source[sourceIndex] == target[targetIndex] ? 0 : 1
+ matrix[i][j] = min(
+ matrix[i - 1][j] + 1, // Deletion
+ matrix[i][j - 1] + 1, // Insertion
+ matrix[i - 1][j - 1] + cost // Substitution
+ )
+ rowMin = min(rowMin, matrix[i][j])
+ }
+ // If the minimum edit distance found in any row is already greater than or equal to the threshold,
+ // you can conclude the edit distance meets the criteria.
+ if rowMin >= threshold {
+ return true
+ }
+ }
+
+ return matrix[sourceCount][targetCount] >= threshold
+}
+
+func translationMeetsStringDistanceRequirements(original: String, translated: String) -> Bool {
+ return levenshteinDistanceIsGreaterThanOrEqualTo(from: original, to: translated, threshold: MIN_UNIQUE_CHARS)
+}
diff --git a/damusTests/TranslationTests.swift b/damusTests/TranslationTests.swift
@@ -0,0 +1,58 @@
+//
+// TranslationTests.swift
+// damusTests
+//
+// Created by KernelKind on 2/13/24.
+//
+
+import XCTest
+@testable import damus
+
+final class TranslationTests : XCTestCase {
+ let translationStringDistanceCases = [
+ ("test", "test ", false),
+ ("wat", "what", false),
+ ("wat's the wether like", "what's the weather like", true),
+ ("GM GZY⚡️\n\redacted 🍆🦪🤙 https://video.nostr.build/7dadcc39e83cbc37c99fabb883314f29c169c1bd994f1d525bde6e9817facc85.mp4 ", "GM GZY⚡️\n\redacted 🍆🦪🤙 https://video.nostr.build/7dadcc39e83cbc37c99fabb883314f29c169c1bd994f1d525bde6e9817facc85.mp4", false),
+ ("Fucking nostr forever typo’s lol 😂", "Fucking nostr forever typo's lol 😂", false),
+ ("where's the library", "donde esta la libreria", true),
+ ("In America", "En América", true)
+ ]
+
+ func testStringDistanceRequirements() {
+ for (original, translated, expectedVal) in translationStringDistanceCases {
+ XCTAssertEqual(translationMeetsStringDistanceRequirements(original: original, translated: translated), expectedVal)
+ }
+ }
+
+ let levenshteinDistanceCases = [
+ // (original string, mutated string, number of changes from original to mutated)
+ ("hello", "hello", 0), // No change
+ ("123", "1234", 1), // Addition at the end
+ ("abcd", "abcde", 1), // Addition at the end
+ ("abc", "a", 2), // Multiple deletions
+ ("abcdef", "abc", 3), // Multiple deletions
+ ("2024", "2025", 1), // Single substitution
+ ("openai", "opnai", 1), // Single deletion
+ ("swift", "swiift", 1), // Single addition
+ ("language", "languag", 1), // Single deletion at the end
+ ("example", "sxample", 1), // Single substitution at the beginning
+ ("distance", "d1stanc3", 2), // Substitutions
+ ("python", "pyth0n", 1), // Single substitution
+ ("algorithm", "algor1thm", 1), // Single substitution in the middle
+ ("implementation", "implemenation", 1), // Single deletion (typo)
+ ("correction", "correctionn", 1), // Single addition at the end
+ ("levenshtein", "levenshtien", 2), // Transposition
+ ("threshold", "threshhold", 1), // Single addition (double letter)
+ ("functionality", "fuctionality", 1), // Single deletion (common typo)
+ ("assessment", "assesment", 1), // Single deletion (common typo)
+ ("performance", "performence", 1), // Single substitution (common typo)
+ ]
+
+ func testLevenshteinDistance() {
+ for (original, mutated, numChanges) in levenshteinDistanceCases {
+ XCTAssertTrue(levenshteinDistanceIsGreaterThanOrEqualTo(from: original, to: mutated, threshold: numChanges))
+ XCTAssertFalse(levenshteinDistanceIsGreaterThanOrEqualTo(from: original, to: mutated, threshold: numChanges+1))
+ }
+ }
+}