notecrumbs

a nostr opengraph server built on nostrdb and egui
git clone git://jb55.com/notecrumbs
Log | Files | Refs | README | LICENSE

sitemap.rs (11493B)


      1 //! Sitemap generation for SEO
      2 //!
      3 //! Generates XML sitemaps from cached events in nostrdb to help search engines
      4 //! discover and index Nostr content rendered by notecrumbs.
      5 
      6 use nostr_sdk::ToBech32;
      7 use nostrdb::{Filter, Ndb, Transaction};
      8 use std::fmt::Write;
      9 use std::sync::OnceLock;
     10 use std::time::Instant;
     11 
     12 /// Maximum URLs per sitemap (XML sitemap standard limit is 50,000)
     13 const MAX_SITEMAP_URLS: u64 = 10000;
     14 
     15 /// Lookback period for notes (90 days) - shorter for timely content
     16 const NOTES_LOOKBACK_DAYS: u64 = 90;
     17 
     18 /// Lookback period for articles (365 days) - longer for evergreen content
     19 const ARTICLES_LOOKBACK_DAYS: u64 = 365;
     20 
     21 /// Cached base URL (computed once at first access)
     22 static BASE_URL: OnceLock<String> = OnceLock::new();
     23 
     24 /// Get the base URL from environment or default
     25 /// Logs a warning once if not explicitly configured
     26 fn get_base_url() -> &'static str {
     27     BASE_URL.get_or_init(|| {
     28         let url = match std::env::var("NOTECRUMBS_BASE_URL") {
     29             Ok(url) => url,
     30             Err(_) => {
     31                 tracing::warn!(
     32                     "NOTECRUMBS_BASE_URL not set, defaulting to https://damus.io - \
     33                      sitemap/robots.txt may point to wrong domain"
     34                 );
     35                 "https://damus.io".to_string()
     36             }
     37         };
     38         normalize_base_url(&url)
     39     })
     40 }
     41 
     42 fn normalize_base_url(url: &str) -> String {
     43     url.trim_end_matches('/').to_string()
     44 }
     45 
     46 /// Calculate Unix timestamp for N days ago
     47 fn days_ago(days: u64) -> u64 {
     48     std::time::SystemTime::now()
     49         .duration_since(std::time::UNIX_EPOCH)
     50         .unwrap_or_default()
     51         .as_secs()
     52         .saturating_sub(days * 24 * 60 * 60)
     53 }
     54 
     55 /// Escape special XML characters in a string
     56 fn xml_escape(s: &str) -> String {
     57     let mut result = String::with_capacity(s.len());
     58     for c in s.chars() {
     59         match c {
     60             '&' => result.push_str("&amp;"),
     61             '<' => result.push_str("&lt;"),
     62             '>' => result.push_str("&gt;"),
     63             '"' => result.push_str("&quot;"),
     64             '\'' => result.push_str("&apos;"),
     65             _ => result.push(c),
     66         }
     67     }
     68     result
     69 }
     70 
     71 /// Format a Unix timestamp as an ISO 8601 date (YYYY-MM-DD)
     72 fn format_lastmod(timestamp: u64) -> String {
     73     use std::time::{Duration, UNIX_EPOCH};
     74 
     75     let datetime = UNIX_EPOCH + Duration::from_secs(timestamp);
     76     let secs_since_epoch = datetime
     77         .duration_since(UNIX_EPOCH)
     78         .unwrap_or_default()
     79         .as_secs();
     80 
     81     // Simple date formatting without external dependencies
     82     let days_since_epoch = secs_since_epoch / 86400;
     83     let mut year = 1970i32;
     84     let mut remaining_days = days_since_epoch as i32;
     85 
     86     loop {
     87         let days_in_year = if is_leap_year(year) { 366 } else { 365 };
     88         if remaining_days < days_in_year {
     89             break;
     90         }
     91         remaining_days -= days_in_year;
     92         year += 1;
     93     }
     94 
     95     let is_leap = is_leap_year(year);
     96     let days_in_months: [i32; 12] = if is_leap {
     97         [31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
     98     } else {
     99         [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
    100     };
    101 
    102     let mut month = 1u32;
    103     for days in days_in_months {
    104         if remaining_days < days {
    105             break;
    106         }
    107         remaining_days -= days;
    108         month += 1;
    109     }
    110 
    111     let day = remaining_days + 1;
    112 
    113     format!("{:04}-{:02}-{:02}", year, month, day)
    114 }
    115 
    116 fn is_leap_year(year: i32) -> bool {
    117     (year % 4 == 0 && year % 100 != 0) || (year % 400 == 0)
    118 }
    119 
    120 /// Entry in the sitemap
    121 struct SitemapEntry {
    122     loc: String,
    123     lastmod: String,
    124     priority: &'static str,
    125     changefreq: &'static str,
    126 }
    127 
    128 /// Generate sitemap XML from cached events in nostrdb
    129 pub fn generate_sitemap(ndb: &Ndb) -> Result<String, nostrdb::Error> {
    130     let start = Instant::now();
    131     let base_url = get_base_url();
    132     let txn = Transaction::new(ndb)?;
    133 
    134     let mut entries: Vec<SitemapEntry> = Vec::new();
    135     let mut notes_count: u64 = 0;
    136     let mut articles_count: u64 = 0;
    137     let mut profiles_count: u64 = 0;
    138 
    139     // Add homepage
    140     entries.push(SitemapEntry {
    141         loc: base_url.to_string(),
    142         lastmod: format_lastmod(
    143             std::time::SystemTime::now()
    144                 .duration_since(std::time::UNIX_EPOCH)
    145                 .unwrap_or_default()
    146                 .as_secs(),
    147         ),
    148         priority: "1.0",
    149         changefreq: "daily",
    150     });
    151 
    152     // Query recent notes (kind:1 - short text notes)
    153     // Use since filter to prioritize recent content for SEO freshness
    154     let notes_filter = Filter::new()
    155         .kinds([1])
    156         .since(days_ago(NOTES_LOOKBACK_DAYS))
    157         .limit(MAX_SITEMAP_URLS)
    158         .build();
    159 
    160     let results = ndb
    161         .query(&txn, &[notes_filter], MAX_SITEMAP_URLS as i32)
    162         .unwrap_or_default();
    163     for result in results {
    164         let Ok(note) = ndb.get_note_by_key(&txn, result.note_key) else {
    165             continue;
    166         };
    167         let Some(eid) = nostr_sdk::EventId::from_slice(note.id()).ok() else {
    168             continue;
    169         };
    170         // to_bech32() returns Result<String, Infallible>, so unwrap is safe
    171         let bech32 = eid.to_bech32().unwrap();
    172         entries.push(SitemapEntry {
    173             loc: format!("{}/{}", base_url, xml_escape(&bech32)),
    174             lastmod: format_lastmod(note.created_at()),
    175             priority: "0.8",
    176             changefreq: "weekly",
    177         });
    178         notes_count += 1;
    179     }
    180 
    181     // Query long-form articles (kind:30023)
    182     // Longer lookback for evergreen content
    183     let articles_filter = Filter::new()
    184         .kinds([30023])
    185         .since(days_ago(ARTICLES_LOOKBACK_DAYS))
    186         .limit(MAX_SITEMAP_URLS)
    187         .build();
    188 
    189     let results = ndb
    190         .query(&txn, &[articles_filter], MAX_SITEMAP_URLS as i32)
    191         .unwrap_or_default();
    192     for result in results {
    193         let Ok(note) = ndb.get_note_by_key(&txn, result.note_key) else {
    194             continue;
    195         };
    196 
    197         // Extract d-tag identifier - skip if missing or empty to avoid
    198         // ambiguous URLs and potential collisions across authors
    199         let identifier = note
    200             .tags()
    201             .iter()
    202             .find(|tag| tag.count() >= 2 && tag.get_unchecked(0).variant().str() == Some("d"))
    203             .and_then(|tag| tag.get_unchecked(1).variant().str());
    204 
    205         let Some(identifier) = identifier else {
    206             continue;
    207         };
    208         if identifier.is_empty() {
    209             continue;
    210         }
    211 
    212         let Some(pk) = nostr_sdk::PublicKey::from_slice(note.pubkey()).ok() else {
    213             continue;
    214         };
    215 
    216         // For addressable events, create naddr
    217         let kind = nostr::Kind::from(note.kind() as u16);
    218         let coord = nostr::nips::nip01::Coordinate::new(kind, pk).identifier(identifier);
    219         let Ok(bech32) = coord.to_bech32() else {
    220             continue;
    221         };
    222 
    223         entries.push(SitemapEntry {
    224             loc: format!("{}/{}", base_url, xml_escape(&bech32)),
    225             lastmod: format_lastmod(note.created_at()),
    226             priority: "0.9",
    227             changefreq: "weekly",
    228         });
    229         articles_count += 1;
    230     }
    231 
    232     // Query profiles (kind:0 - metadata)
    233     // No since filter for profiles - they update less frequently
    234     let profiles_filter = Filter::new().kinds([0]).limit(MAX_SITEMAP_URLS).build();
    235 
    236     let results = ndb
    237         .query(&txn, &[profiles_filter], MAX_SITEMAP_URLS as i32)
    238         .unwrap_or_default();
    239     for result in results {
    240         let Ok(note) = ndb.get_note_by_key(&txn, result.note_key) else {
    241             continue;
    242         };
    243         let Some(pk) = nostr_sdk::PublicKey::from_slice(note.pubkey()).ok() else {
    244             continue;
    245         };
    246         // to_bech32() returns Result<String, Infallible>, so unwrap is safe
    247         let bech32 = pk.to_bech32().unwrap();
    248         entries.push(SitemapEntry {
    249             loc: format!("{}/{}", base_url, xml_escape(&bech32)),
    250             lastmod: format_lastmod(note.created_at()),
    251             priority: "0.7",
    252             changefreq: "weekly",
    253         });
    254         profiles_count += 1;
    255     }
    256 
    257     // Build XML
    258     let mut xml = String::with_capacity(entries.len() * 200);
    259     xml.push_str("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
    260     xml.push_str("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n");
    261 
    262     for entry in &entries {
    263         let _ = write!(
    264             xml,
    265             "  <url>\n    <loc>{}</loc>\n    <lastmod>{}</lastmod>\n    <changefreq>{}</changefreq>\n    <priority>{}</priority>\n  </url>\n",
    266             entry.loc, entry.lastmod, entry.changefreq, entry.priority
    267         );
    268     }
    269 
    270     xml.push_str("</urlset>\n");
    271 
    272     // Record metrics (aggregate stats, not user-tracking)
    273     let duration = start.elapsed();
    274     metrics::counter!("sitemap_generations_total", 1);
    275     metrics::gauge!(
    276         "sitemap_generation_duration_seconds",
    277         duration.as_secs_f64()
    278     );
    279     metrics::gauge!("sitemap_urls_total", entries.len() as f64);
    280     metrics::gauge!("sitemap_notes_count", notes_count as f64);
    281     metrics::gauge!("sitemap_articles_count", articles_count as f64);
    282     metrics::gauge!("sitemap_profiles_count", profiles_count as f64);
    283 
    284     Ok(xml)
    285 }
    286 
    287 /// Generate robots.txt content
    288 pub fn generate_robots_txt() -> String {
    289     let base_url = get_base_url();
    290     format!(
    291         "User-agent: *\n\
    292          Allow: /\n\
    293          Allow: /.well-known/nostr.json\n\
    294          Disallow: /metrics\n\
    295          Disallow: /*.json\n\
    296          \n\
    297          Sitemap: {}/sitemap.xml\n",
    298         base_url
    299     )
    300 }
    301 
    302 #[cfg(test)]
    303 mod tests {
    304     use super::*;
    305 
    306     #[test]
    307     fn test_xml_escape() {
    308         assert_eq!(xml_escape("hello"), "hello");
    309         assert_eq!(xml_escape("a&b"), "a&amp;b");
    310         assert_eq!(xml_escape("<tag>"), "&lt;tag&gt;");
    311         assert_eq!(xml_escape("\"quoted\""), "&quot;quoted&quot;");
    312     }
    313 
    314     #[test]
    315     fn test_format_lastmod() {
    316         // 2024-01-01 00:00:00 UTC = 1704067200
    317         assert_eq!(format_lastmod(1704067200), "2024-01-01");
    318         // 2023-06-15 12:00:00 UTC = 1686830400
    319         assert_eq!(format_lastmod(1686830400), "2023-06-15");
    320     }
    321 
    322     #[test]
    323     fn test_is_leap_year() {
    324         assert!(is_leap_year(2000));
    325         assert!(is_leap_year(2024));
    326         assert!(!is_leap_year(1900));
    327         assert!(!is_leap_year(2023));
    328     }
    329 
    330     #[test]
    331     fn test_normalize_base_url() {
    332         assert_eq!(
    333             normalize_base_url("https://example.com/"),
    334             "https://example.com"
    335         );
    336         assert_eq!(
    337             normalize_base_url("https://example.com"),
    338             "https://example.com"
    339         );
    340     }
    341 
    342     #[test]
    343     fn test_days_ago_range() {
    344         let start = std::time::SystemTime::now()
    345             .duration_since(std::time::UNIX_EPOCH)
    346             .unwrap_or_default()
    347             .as_secs();
    348         let cutoff = days_ago(1);
    349         let end = std::time::SystemTime::now()
    350             .duration_since(std::time::UNIX_EPOCH)
    351             .unwrap_or_default()
    352             .as_secs();
    353 
    354         let start_cutoff = start.saturating_sub(86400);
    355         let end_cutoff = end.saturating_sub(86400);
    356         assert!(cutoff >= start_cutoff);
    357         assert!(cutoff <= end_cutoff);
    358     }
    359 
    360     #[test]
    361     fn test_robots_txt_format() {
    362         let robots = generate_robots_txt();
    363         assert!(robots.contains("User-agent: *"));
    364         assert!(robots.contains("Allow: /"));
    365         assert!(robots.contains("Disallow: /metrics"));
    366         assert!(robots.contains("Sitemap:"));
    367     }
    368 }