// sitemap.rs (11493B)
1 //! Sitemap generation for SEO 2 //! 3 //! Generates XML sitemaps from cached events in nostrdb to help search engines 4 //! discover and index Nostr content rendered by notecrumbs. 5 6 use nostr_sdk::ToBech32; 7 use nostrdb::{Filter, Ndb, Transaction}; 8 use std::fmt::Write; 9 use std::sync::OnceLock; 10 use std::time::Instant; 11 12 /// Maximum URLs per sitemap (XML sitemap standard limit is 50,000) 13 const MAX_SITEMAP_URLS: u64 = 10000; 14 15 /// Lookback period for notes (90 days) - shorter for timely content 16 const NOTES_LOOKBACK_DAYS: u64 = 90; 17 18 /// Lookback period for articles (365 days) - longer for evergreen content 19 const ARTICLES_LOOKBACK_DAYS: u64 = 365; 20 21 /// Cached base URL (computed once at first access) 22 static BASE_URL: OnceLock<String> = OnceLock::new(); 23 24 /// Get the base URL from environment or default 25 /// Logs a warning once if not explicitly configured 26 fn get_base_url() -> &'static str { 27 BASE_URL.get_or_init(|| { 28 let url = match std::env::var("NOTECRUMBS_BASE_URL") { 29 Ok(url) => url, 30 Err(_) => { 31 tracing::warn!( 32 "NOTECRUMBS_BASE_URL not set, defaulting to https://damus.io - \ 33 sitemap/robots.txt may point to wrong domain" 34 ); 35 "https://damus.io".to_string() 36 } 37 }; 38 normalize_base_url(&url) 39 }) 40 } 41 42 fn normalize_base_url(url: &str) -> String { 43 url.trim_end_matches('/').to_string() 44 } 45 46 /// Calculate Unix timestamp for N days ago 47 fn days_ago(days: u64) -> u64 { 48 std::time::SystemTime::now() 49 .duration_since(std::time::UNIX_EPOCH) 50 .unwrap_or_default() 51 .as_secs() 52 .saturating_sub(days * 24 * 60 * 60) 53 } 54 55 /// Escape special XML characters in a string 56 fn xml_escape(s: &str) -> String { 57 let mut result = String::with_capacity(s.len()); 58 for c in s.chars() { 59 match c { 60 '&' => result.push_str("&"), 61 '<' => result.push_str("<"), 62 '>' => result.push_str(">"), 63 '"' => result.push_str("""), 64 '\'' => result.push_str("'"), 65 _ => 
result.push(c), 66 } 67 } 68 result 69 } 70 71 /// Format a Unix timestamp as an ISO 8601 date (YYYY-MM-DD) 72 fn format_lastmod(timestamp: u64) -> String { 73 use std::time::{Duration, UNIX_EPOCH}; 74 75 let datetime = UNIX_EPOCH + Duration::from_secs(timestamp); 76 let secs_since_epoch = datetime 77 .duration_since(UNIX_EPOCH) 78 .unwrap_or_default() 79 .as_secs(); 80 81 // Simple date formatting without external dependencies 82 let days_since_epoch = secs_since_epoch / 86400; 83 let mut year = 1970i32; 84 let mut remaining_days = days_since_epoch as i32; 85 86 loop { 87 let days_in_year = if is_leap_year(year) { 366 } else { 365 }; 88 if remaining_days < days_in_year { 89 break; 90 } 91 remaining_days -= days_in_year; 92 year += 1; 93 } 94 95 let is_leap = is_leap_year(year); 96 let days_in_months: [i32; 12] = if is_leap { 97 [31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] 98 } else { 99 [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] 100 }; 101 102 let mut month = 1u32; 103 for days in days_in_months { 104 if remaining_days < days { 105 break; 106 } 107 remaining_days -= days; 108 month += 1; 109 } 110 111 let day = remaining_days + 1; 112 113 format!("{:04}-{:02}-{:02}", year, month, day) 114 } 115 116 fn is_leap_year(year: i32) -> bool { 117 (year % 4 == 0 && year % 100 != 0) || (year % 400 == 0) 118 } 119 120 /// Entry in the sitemap 121 struct SitemapEntry { 122 loc: String, 123 lastmod: String, 124 priority: &'static str, 125 changefreq: &'static str, 126 } 127 128 /// Generate sitemap XML from cached events in nostrdb 129 pub fn generate_sitemap(ndb: &Ndb) -> Result<String, nostrdb::Error> { 130 let start = Instant::now(); 131 let base_url = get_base_url(); 132 let txn = Transaction::new(ndb)?; 133 134 let mut entries: Vec<SitemapEntry> = Vec::new(); 135 let mut notes_count: u64 = 0; 136 let mut articles_count: u64 = 0; 137 let mut profiles_count: u64 = 0; 138 139 // Add homepage 140 entries.push(SitemapEntry { 141 loc: base_url.to_string(), 142 
lastmod: format_lastmod( 143 std::time::SystemTime::now() 144 .duration_since(std::time::UNIX_EPOCH) 145 .unwrap_or_default() 146 .as_secs(), 147 ), 148 priority: "1.0", 149 changefreq: "daily", 150 }); 151 152 // Query recent notes (kind:1 - short text notes) 153 // Use since filter to prioritize recent content for SEO freshness 154 let notes_filter = Filter::new() 155 .kinds([1]) 156 .since(days_ago(NOTES_LOOKBACK_DAYS)) 157 .limit(MAX_SITEMAP_URLS) 158 .build(); 159 160 let results = ndb 161 .query(&txn, &[notes_filter], MAX_SITEMAP_URLS as i32) 162 .unwrap_or_default(); 163 for result in results { 164 let Ok(note) = ndb.get_note_by_key(&txn, result.note_key) else { 165 continue; 166 }; 167 let Some(eid) = nostr_sdk::EventId::from_slice(note.id()).ok() else { 168 continue; 169 }; 170 // to_bech32() returns Result<String, Infallible>, so unwrap is safe 171 let bech32 = eid.to_bech32().unwrap(); 172 entries.push(SitemapEntry { 173 loc: format!("{}/{}", base_url, xml_escape(&bech32)), 174 lastmod: format_lastmod(note.created_at()), 175 priority: "0.8", 176 changefreq: "weekly", 177 }); 178 notes_count += 1; 179 } 180 181 // Query long-form articles (kind:30023) 182 // Longer lookback for evergreen content 183 let articles_filter = Filter::new() 184 .kinds([30023]) 185 .since(days_ago(ARTICLES_LOOKBACK_DAYS)) 186 .limit(MAX_SITEMAP_URLS) 187 .build(); 188 189 let results = ndb 190 .query(&txn, &[articles_filter], MAX_SITEMAP_URLS as i32) 191 .unwrap_or_default(); 192 for result in results { 193 let Ok(note) = ndb.get_note_by_key(&txn, result.note_key) else { 194 continue; 195 }; 196 197 // Extract d-tag identifier - skip if missing or empty to avoid 198 // ambiguous URLs and potential collisions across authors 199 let identifier = note 200 .tags() 201 .iter() 202 .find(|tag| tag.count() >= 2 && tag.get_unchecked(0).variant().str() == Some("d")) 203 .and_then(|tag| tag.get_unchecked(1).variant().str()); 204 205 let Some(identifier) = identifier else { 206 continue; 
207 }; 208 if identifier.is_empty() { 209 continue; 210 } 211 212 let Some(pk) = nostr_sdk::PublicKey::from_slice(note.pubkey()).ok() else { 213 continue; 214 }; 215 216 // For addressable events, create naddr 217 let kind = nostr::Kind::from(note.kind() as u16); 218 let coord = nostr::nips::nip01::Coordinate::new(kind, pk).identifier(identifier); 219 let Ok(bech32) = coord.to_bech32() else { 220 continue; 221 }; 222 223 entries.push(SitemapEntry { 224 loc: format!("{}/{}", base_url, xml_escape(&bech32)), 225 lastmod: format_lastmod(note.created_at()), 226 priority: "0.9", 227 changefreq: "weekly", 228 }); 229 articles_count += 1; 230 } 231 232 // Query profiles (kind:0 - metadata) 233 // No since filter for profiles - they update less frequently 234 let profiles_filter = Filter::new().kinds([0]).limit(MAX_SITEMAP_URLS).build(); 235 236 let results = ndb 237 .query(&txn, &[profiles_filter], MAX_SITEMAP_URLS as i32) 238 .unwrap_or_default(); 239 for result in results { 240 let Ok(note) = ndb.get_note_by_key(&txn, result.note_key) else { 241 continue; 242 }; 243 let Some(pk) = nostr_sdk::PublicKey::from_slice(note.pubkey()).ok() else { 244 continue; 245 }; 246 // to_bech32() returns Result<String, Infallible>, so unwrap is safe 247 let bech32 = pk.to_bech32().unwrap(); 248 entries.push(SitemapEntry { 249 loc: format!("{}/{}", base_url, xml_escape(&bech32)), 250 lastmod: format_lastmod(note.created_at()), 251 priority: "0.7", 252 changefreq: "weekly", 253 }); 254 profiles_count += 1; 255 } 256 257 // Build XML 258 let mut xml = String::with_capacity(entries.len() * 200); 259 xml.push_str("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"); 260 xml.push_str("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n"); 261 262 for entry in &entries { 263 let _ = write!( 264 xml, 265 " <url>\n <loc>{}</loc>\n <lastmod>{}</lastmod>\n <changefreq>{}</changefreq>\n <priority>{}</priority>\n </url>\n", 266 entry.loc, entry.lastmod, entry.changefreq, entry.priority 267 ); 
268 } 269 270 xml.push_str("</urlset>\n"); 271 272 // Record metrics (aggregate stats, not user-tracking) 273 let duration = start.elapsed(); 274 metrics::counter!("sitemap_generations_total", 1); 275 metrics::gauge!( 276 "sitemap_generation_duration_seconds", 277 duration.as_secs_f64() 278 ); 279 metrics::gauge!("sitemap_urls_total", entries.len() as f64); 280 metrics::gauge!("sitemap_notes_count", notes_count as f64); 281 metrics::gauge!("sitemap_articles_count", articles_count as f64); 282 metrics::gauge!("sitemap_profiles_count", profiles_count as f64); 283 284 Ok(xml) 285 } 286 287 /// Generate robots.txt content 288 pub fn generate_robots_txt() -> String { 289 let base_url = get_base_url(); 290 format!( 291 "User-agent: *\n\ 292 Allow: /\n\ 293 Allow: /.well-known/nostr.json\n\ 294 Disallow: /metrics\n\ 295 Disallow: /*.json\n\ 296 \n\ 297 Sitemap: {}/sitemap.xml\n", 298 base_url 299 ) 300 } 301 302 #[cfg(test)] 303 mod tests { 304 use super::*; 305 306 #[test] 307 fn test_xml_escape() { 308 assert_eq!(xml_escape("hello"), "hello"); 309 assert_eq!(xml_escape("a&b"), "a&b"); 310 assert_eq!(xml_escape("<tag>"), "<tag>"); 311 assert_eq!(xml_escape("\"quoted\""), ""quoted""); 312 } 313 314 #[test] 315 fn test_format_lastmod() { 316 // 2024-01-01 00:00:00 UTC = 1704067200 317 assert_eq!(format_lastmod(1704067200), "2024-01-01"); 318 // 2023-06-15 12:00:00 UTC = 1686830400 319 assert_eq!(format_lastmod(1686830400), "2023-06-15"); 320 } 321 322 #[test] 323 fn test_is_leap_year() { 324 assert!(is_leap_year(2000)); 325 assert!(is_leap_year(2024)); 326 assert!(!is_leap_year(1900)); 327 assert!(!is_leap_year(2023)); 328 } 329 330 #[test] 331 fn test_normalize_base_url() { 332 assert_eq!( 333 normalize_base_url("https://example.com/"), 334 "https://example.com" 335 ); 336 assert_eq!( 337 normalize_base_url("https://example.com"), 338 "https://example.com" 339 ); 340 } 341 342 #[test] 343 fn test_days_ago_range() { 344 let start = std::time::SystemTime::now() 345 
.duration_since(std::time::UNIX_EPOCH) 346 .unwrap_or_default() 347 .as_secs(); 348 let cutoff = days_ago(1); 349 let end = std::time::SystemTime::now() 350 .duration_since(std::time::UNIX_EPOCH) 351 .unwrap_or_default() 352 .as_secs(); 353 354 let start_cutoff = start.saturating_sub(86400); 355 let end_cutoff = end.saturating_sub(86400); 356 assert!(cutoff >= start_cutoff); 357 assert!(cutoff <= end_cutoff); 358 } 359 360 #[test] 361 fn test_robots_txt_format() { 362 let robots = generate_robots_txt(); 363 assert!(robots.contains("User-agent: *")); 364 assert!(robots.contains("Allow: /")); 365 assert!(robots.contains("Disallow: /metrics")); 366 assert!(robots.contains("Sitemap:")); 367 } 368 }