damus

nostr ios client
git clone git://jb55.com/damus
Log | Files | Refs | README | LICENSE

commit 58326f679e3fb5b19c0bce667bdc7ea75066813a
parent 90180202b68b84fa19909e5dce08a2247544d31f
Author: kernelkind <kernelkind@gmail.com>
Date:   Tue, 13 Feb 2024 22:20:18 -0500

translate: implement string distance for close matches

Implement the levenshtein string distance algorithm for determining
if the translation is too similar to the original content.

Closes: https://github.com/damus-io/damus/issues/1996

Lightning-address: kernelkind@getalby.com
Signed-off-by: kernelkind <kernelkind@gmail.com>
Reviewed-by: William Casarin <jb55@jb55.com>
Link: 20240214032018.57812-1-kernelkind@gmail.com
Signed-off-by: William Casarin <jb55@jb55.com>

Diffstat:
Mdamus.xcodeproj/project.pbxproj | 4++++
Mdamus/Components/TranslateView.swift | 57+++++++++++++++++++++++++++++++++++++++++++++++++++++++++
AdamusTests/TranslationTests.swift | 58++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 119 insertions(+), 0 deletions(-)

diff --git a/damus.xcodeproj/project.pbxproj b/damus.xcodeproj/project.pbxproj @@ -627,6 +627,7 @@ D7FF94002AC7AC5300FD969D /* RelayURL.swift in Sources */ = {isa = PBXBuildFile; fileRef = D7FF93FF2AC7AC5200FD969D /* RelayURL.swift */; }; E02B54182B4DFADA0077FF42 /* Bech32ObjectTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E02B54172B4DFADA0077FF42 /* Bech32ObjectTests.swift */; }; E04A37C62B544F090029650D /* URIParsing.swift in Sources */ = {isa = PBXBuildFile; fileRef = E04A37C52B544F090029650D /* URIParsing.swift */; }; + E0E024112B7C19C20075735D /* TranslationTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E0E024102B7C19C20075735D /* TranslationTests.swift */; }; E4FA1C032A24BB7F00482697 /* SearchSettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E4FA1C022A24BB7F00482697 /* SearchSettingsView.swift */; }; E990020F2955F837003BBC5A /* EditMetadataView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E990020E2955F837003BBC5A /* EditMetadataView.swift */; }; E9E4ED0B295867B900DD7078 /* ThreadView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E9E4ED0A295867B900DD7078 /* ThreadView.swift */; }; @@ -1402,6 +1403,7 @@ D7FF93FF2AC7AC5200FD969D /* RelayURL.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = RelayURL.swift; sourceTree = "<group>"; }; E02B54172B4DFADA0077FF42 /* Bech32ObjectTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Bech32ObjectTests.swift; sourceTree = "<group>"; }; E04A37C52B544F090029650D /* URIParsing.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = URIParsing.swift; sourceTree = "<group>"; }; + E0E024102B7C19C20075735D /* TranslationTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TranslationTests.swift; sourceTree = "<group>"; }; E4FA1C022A24BB7F00482697 /* SearchSettingsView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SearchSettingsView.swift; sourceTree = "<group>"; }; E990020E2955F837003BBC5A /* EditMetadataView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = EditMetadataView.swift; sourceTree = "<group>"; }; E9E4ED0A295867B900DD7078 /* ThreadView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ThreadView.swift; sourceTree = "<group>"; }; @@ -2504,6 +2506,7 @@ D72A2CFF2AD9B66B002AFF62 /* EventViewTests.swift */, D7315A2B2ACDF4DA0036E30A /* DamusCacheManagerTests.swift */, B501062C2B363036003874F5 /* AuthIntegrationTests.swift */, + E0E024102B7C19C20075735D /* TranslationTests.swift */, ); path = damusTests; sourceTree = "<group>"; @@ -3443,6 +3446,7 @@ D7315A2C2ACDF4DA0036E30A /* DamusCacheManagerTests.swift in Sources */, 4C9054852A6AEAA000811EEC /* NdbTests.swift in Sources */, 75AD872B2AA23A460085EF2C /* Block+Tests.swift in Sources */, + E0E024112B7C19C20075735D /* TranslationTests.swift in Sources */, F944F56E29EA9CCC0067B3BF /* DamusParseContentTests.swift in Sources */, 3A5E47C72A4A76C800C0D090 /* TrieTests.swift in Sources */, B501062D2B363036003874F5 /* AuthIntegrationTests.swift in Sources */, diff --git a/damus/Components/TranslateView.swift b/damus/Components/TranslateView.swift @@ -21,6 +21,8 @@ enum TranslateStatus: Equatable { case not_needed } +fileprivate let MIN_UNIQUE_CHARS = 2 + struct TranslateView: View { let damus_state: DamusState let event: NostrEvent @@ -107,6 +109,10 @@ struct TranslateView: View { attempt_translation() } } + + func translationMeetsStringDistanceRequirements(original: String, translated: String) -> Bool { + return levenshteinDistanceIsGreaterThanOrEqualTo(from: original, to: translated, threshold: MIN_UNIQUE_CHARS) + } } extension View { @@ -141,6 +147,10 @@ func translate_note(profiles: Profiles, keypair: Keypair, event: NostrEvent, set // if its the same, give up and don't retry return .not_needed } + + guard translationMeetsStringDistanceRequirements(original: originalContent, translated: translated_note) else { + return .not_needed + } // Render translated note let translated_blocks = parse_note_content(content: .content(translated_note, event.tags)) @@ -158,3 +168,50 @@ func current_language() -> String { } } +func levenshteinDistanceIsGreaterThanOrEqualTo(from source: String, to target: String, threshold: Int) -> Bool { + let sourceCount = source.count + let targetCount = target.count + + // Early return if the difference in lengths is already greater than or equal to the threshold, + // indicating the edit distance meets the condition without further calculation. + if abs(sourceCount - targetCount) >= threshold { + return true + } + + var matrix = [[Int]](repeating: [Int](repeating: 0, count: targetCount + 1), count: sourceCount + 1) + + for i in 0...sourceCount { + matrix[i][0] = i + } + + for j in 0...targetCount { + matrix[0][j] = j + } + + for i in 1...sourceCount { + var rowMin = Int.max + for j in 1...targetCount { + let sourceIndex = source.index(source.startIndex, offsetBy: i - 1) + let targetIndex = target.index(target.startIndex, offsetBy: j - 1) + + let cost = source[sourceIndex] == target[targetIndex] ? 0 : 1 + matrix[i][j] = min( + matrix[i - 1][j] + 1, // Deletion + matrix[i][j - 1] + 1, // Insertion + matrix[i - 1][j - 1] + cost // Substitution + ) + rowMin = min(rowMin, matrix[i][j]) + } + // If the minimum edit distance found in any row is already greater than or equal to the threshold, + // you can conclude the edit distance meets the criteria. + if rowMin >= threshold { + return true + } + } + + return matrix[sourceCount][targetCount] >= threshold +} + +func translationMeetsStringDistanceRequirements(original: String, translated: String) -> Bool { + return levenshteinDistanceIsGreaterThanOrEqualTo(from: original, to: translated, threshold: MIN_UNIQUE_CHARS) +} diff --git a/damusTests/TranslationTests.swift b/damusTests/TranslationTests.swift @@ -0,0 +1,58 @@ +// +// TranslationTests.swift +// damusTests +// +// Created by KernelKind on 2/13/24. +// + +import XCTest +@testable import damus + +final class TranslationTests : XCTestCase { + let translationStringDistanceCases = [ + ("test", "test ", false), + ("wat", "what", false), + ("wat's the wether like", "what's the weather like", true), + ("GM GZY⚡️\n\redacted 🍆🦪🤙 https://video.nostr.build/7dadcc39e83cbc37c99fabb883314f29c169c1bd994f1d525bde6e9817facc85.mp4 ", "GM GZY⚡️\n\redacted 🍆🦪🤙 https://video.nostr.build/7dadcc39e83cbc37c99fabb883314f29c169c1bd994f1d525bde6e9817facc85.mp4", false), + ("Fucking nostr forever typo’s lol 😂", "Fucking nostr forever typo's lol 😂", false), + ("where's the library", "donde esta la libreria", true), + ("In America", "En América", true) + ] + + func testStringDistanceRequirements() { + for (original, translated, expectedVal) in translationStringDistanceCases { + XCTAssertEqual(translationMeetsStringDistanceRequirements(original: original, translated: translated), expectedVal) + } + } + + let levenshteinDistanceCases = [ + // (original string, mutated string, number of changes from original to mutated) + ("hello", "hello", 0), // No change + ("123", "1234", 1), // Addition at the end + ("abcd", "abcde", 1), // Addition at the end + ("abc", "a", 2), // Multiple deletions + ("abcdef", "abc", 3), // Multiple deletions + ("2024", "2025", 1), // Single substitution + ("openai", "opnai", 1), // Single deletion + ("swift", "swiift", 1), // Single addition + ("language", "languag", 1), // Single deletion at the end + ("example", "sxample", 1), // Single substitution at the beginning + ("distance", "d1stanc3", 2), // Substitutions + ("python", "pyth0n", 1), // Single substitution + ("algorithm", "algor1thm", 1), // Single substitution in the middle + ("implementation", "implemenation", 1), // Single deletion (typo) + ("correction", "correctionn", 1), // Single addition at the end + ("levenshtein", "levenshtien", 2), // Transposition + ("threshold", "threshhold", 1), // Single addition (double letter) + ("functionality", "fuctionality", 1), // Single deletion (common typo) + ("assessment", "assesment", 1), // Single deletion (common typo) + ("performance", "performence", 1), // Single substitution (common typo) + ] + + func testLevenshteinDistance() { + for (original, mutated, numChanges) in levenshteinDistanceCases { + XCTAssertTrue(levenshteinDistanceIsGreaterThanOrEqualTo(from: original, to: mutated, threshold: numChanges)) + XCTAssertFalse(levenshteinDistanceIsGreaterThanOrEqualTo(from: original, to: mutated, threshold: numChanges+1)) + } + } +}