commit bf78c0a3a00602cb06f03b6281fa84dfd6216e5f
parent eb41846bb91960234606add39cf60e95ddd5e3d3
Author: Terry Yiu <git@tyiu.xyz>
Date: Sun, 7 Jan 2024 14:07:09 -0500
translation: add workaround to reduce wasteful translation requests
Signed-off-by: Terry Yiu <git@tyiu.xyz>
Reviewed-by: William Casarin <jb55@jb55.com>
Signed-off-by: William Casarin <jb55@jb55.com>
Changelog-Fixed: Add workaround to fix note language recognition and reduce wasteful translation requests
Diffstat:
2 files changed, 32 insertions(+), 5 deletions(-)
diff --git a/damus/Util/Translator.swift b/damus/Util/Translator.swift
@@ -23,6 +23,11 @@ public struct Translator {
}
public func translate(_ text: String, from sourceLanguage: String, to targetLanguage: String) async throws -> String? {
+ // Do not attempt to translate if the source and target languages are the same.
+ guard sourceLanguage != targetLanguage else {
+ return nil
+ }
+
switch userSettingsStore.translation_service {
case .purple:
return try await translateWithPurple(text, from: sourceLanguage, to: targetLanguage)
@@ -35,7 +40,7 @@ public struct Translator {
case .deepl:
return try await translateWithDeepL(text, from: sourceLanguage, to: targetLanguage)
case .none:
- return text
+ return nil
}
}
diff --git a/nostrdb/NdbNote.swift b/nostrdb/NdbNote.swift
@@ -411,7 +411,25 @@ extension NdbNote {
let originalBlocks = self.blocks(keypair).blocks
let originalOnlyText = originalBlocks.compactMap {
if case .text(let txt) = $0 {
- return txt
+ // Replacing right single quotation marks (’) with "typewriter or ASCII apostrophes" (')
+ // as a workaround to get Apple's language recognizer to predict the language correctly.
+ // It is important to get the language right because sending unnecessary translation requests wastes users' money.
+ // Until Apple fixes their language model, this workaround will be kept in place.
+ // See https://en.wikipedia.org/wiki/Apostrophe#Unicode for an explanation of the differences between the two characters.
+ //
+ // For example,
+ // "nevent1qqs0wsknetaju06xk39cv8sttd064amkykqalvfue7ydtg3p0lyfksqzyrhxagf6h8l9cjngatumrg60uq22v66qz979pm32v985ek54ndh8gj42wtp"
+ // has the note content "It’s a meme".
+ // Without the character replacement, it is 61% confident that the text is in Turkish (tr) and 8% confident that the text is in English (en),
+ // which is a wildly incorrect hypothesis.
+ // With the character replacement, it is 65% confident that the text is in English (en) and 24% confident that the text is in Turkish (tr), which is more accurate.
+ //
+ // Similarly,
+ // "nevent1qqspjqlln6wvxrqg6kzl2p7gk0rgr5stc7zz5sstl34cxlw55gvtylgpp4mhxue69uhkummn9ekx7mqpr4mhxue69uhkummnw3ez6ur4vgh8wetvd3hhyer9wghxuet5qy28wumn8ghj7un9d3shjtnwdaehgu3wvfnsygpx6655ve67vqlcme9ld7ww73pqx7msclhwzu8lqmkhvuluxnyc7yhf3xut"
+ // has the note content "You’re funner".
+ // Without the character replacement, it is 52% confident that the text is in Norwegian Bokmål (nb) and 41% confident that the text is in English (en).
+ // With the character replacement, it is 93% confident that the text is in English (en) and 4% confident that the text is in Norwegian Bokmål (nb).
+ return txt.replacingOccurrences(of: "’", with: "'")
}
else {
return nil
@@ -419,13 +437,17 @@ extension NdbNote {
}
.joined(separator: " ")
- // Only accept language recognition hypothesis if there's at least a 50% probability that it's accurate.
+ // If there is no text, there's nothing to use to detect language.
+ guard !originalOnlyText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
+ return nil
+ }
+
let languageRecognizer = NLLanguageRecognizer()
languageRecognizer.processString(originalOnlyText)
+ // Only accept language recognition hypothesis if there's at least a 50% probability that it's accurate.
guard let locale = languageRecognizer.languageHypotheses(withMaximum: 1).first(where: { $0.value >= 0.5 })?.key.rawValue else {
- let nstr: String? = nil
- return nstr
+ return nil
}
// Remove the variant component and just take the language part, as translation services typically only support the variant-less language.