Skip to main content

hashiverse_lib/tools/
url_preview.rs

1//! # Open Graph / link-preview extraction
2//!
3//! Parses the `<head>` of an HTML page and extracts the fields needed to render a link
4//! preview card in a post: title, description, image, and canonical URL.
5//!
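//! Open Graph (`og:title`, `og:description`, `og:image`, `og:url`) is preferred. For each
//! field whose OG value is missing or empty, the extractor falls back to the matching
//! Twitter Card tag (`twitter:title`, `twitter:description`, `twitter:image`,
//! `twitter:image:src`), then to the page's `<title>` element and
//! `<meta name="description">`; the canonical URL falls back to `<link rel="canonical">`.
//! The [`UrlPreviewData`] struct is what callers hand up to the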
9//! protocol layer — servers fetch the target URL under an RPC budget gated by
10//! `POW_MINIMUM_PER_URL_FETCH` and return this struct back to the client so the preview
11//! card can render without every client individually fetching (and thus leaking its IP to)
12//! the target site.
13
14use scraper::{Html, Selector};
15
/// Link-preview fields extracted from an HTML document by [`extract_url_preview`].
///
/// Every field is a plain `String`. A field the extractor could not find is left as the
/// empty string, which is also what [`Default`] produces for the whole struct.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct UrlPreviewData {
    /// Title to show on the preview card; empty when the page supplies none.
    pub title: String,
    /// Short description to show on the preview card; empty when the page supplies none.
    pub description: String,
    /// URL of the preview image exactly as written in the page; empty when absent.
    pub image_url: String,
    /// Canonical URL declared by the page; empty when absent.
    pub canonical_url: String,
}
22
23pub fn extract_url_preview(html: &str) -> UrlPreviewData {
24    let document = Html::parse_document(html);
25
26    let title = first_non_empty([
27        select_meta_content(&document, "meta[property='og:title']"),
28        select_meta_content(&document, "meta[name='twitter:title']"),
29        select_title(&document),
30    ])
31    .unwrap_or_default();
32
33    let description = first_non_empty([
34        select_meta_content(&document, "meta[property='og:description']"),
35        select_meta_content(&document, "meta[name='twitter:description']"),
36        select_meta_content(&document, "meta[name='description']"),
37    ])
38    .unwrap_or_default();
39
40    let image_url = first_non_empty([
41        select_meta_content(&document, "meta[property='og:image']"),
42        select_meta_content(&document, "meta[name='twitter:image']"),
43        select_meta_content(&document, "meta[name='twitter:image:src']"),
44    ])
45    .unwrap_or_default();
46
47    let canonical_url = first_non_empty([
48        select_meta_content(&document, "meta[property='og:url']"),
49        select_link_href(&document, "link[rel='canonical']"),
50    ])
51    .unwrap_or_default();
52
53    UrlPreviewData {
54        title,
55        description,
56        image_url,
57        canonical_url,
58    }
59}
60
61fn select_meta_content(document: &Html, selector_str: &str) -> Option<String> {
62    let selector = Selector::parse(selector_str).ok()?;
63    document.select(&selector).next()?.value().attr("content").map(|s| s.to_string())
64}
65
66fn select_link_href(document: &Html, selector_str: &str) -> Option<String> {
67    let selector = Selector::parse(selector_str).ok()?;
68    document.select(&selector).next()?.value().attr("href").map(|s| s.to_string())
69}
70
71fn select_title(document: &Html) -> Option<String> {
72    let selector = Selector::parse("title").ok()?;
73    Some(document.select(&selector).next()?.text().collect::<String>())
74}
75
/// Returns the first candidate that still has content after trimming surrounding
/// whitespace, or `None` when every candidate is absent or blank.
///
/// A present-but-empty `content=""` should fall through, not short-circuit. The same holds
/// for whitespace-only values such as `content="  "`. The returned value is trimmed, so a
/// multi-line `<title>` does not carry its leading and trailing newlines and indentation
/// into the preview.
fn first_non_empty<I: IntoIterator<Item = Option<String>>>(candidates: I) -> Option<String> {
    candidates.into_iter().flatten().find_map(|candidate| {
        let trimmed = candidate.trim();
        if trimmed.is_empty() {
            None
        } else if trimmed.len() == candidate.len() {
            // Nothing was trimmed: hand back the original allocation.
            Some(candidate)
        } else {
            Some(trimmed.to_owned())
        }
    })
}
80
#[cfg(test)]
mod tests {
    use super::*;

    /// With all four `og:*` tags present, every field comes from Open Graph and the
    /// `<title>` element is ignored.
    #[test]
    fn test_extract_url_preview_with_og_tags() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
                <meta property="og:title" content="OG Title" />
                <meta property="og:description" content="OG Description" />
                <meta property="og:image" content="https://example.com/og.png" />
                <meta property="og:url" content="https://example.com/canonical" />
                <title>Page Title</title>
            </head>
            <body></body>
            </html>
        "#;

        let UrlPreviewData {
            title,
            description,
            image_url,
            canonical_url,
        } = extract_url_preview(page);
        assert_eq!(title, "OG Title");
        assert_eq!(description, "OG Description");
        assert_eq!(image_url, "https://example.com/og.png");
        assert_eq!(canonical_url, "https://example.com/canonical");
    }

    /// Without any OG tags, title and description come from `<title>` and
    /// `<meta name="description">`; image and canonical URL stay empty.
    #[test]
    fn test_extract_url_preview_fallback_to_title_and_meta_description() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
                <title>Fallback Title</title>
                <meta name="description" content="Fallback Description" />
            </head>
            <body></body>
            </html>
        "#;

        let UrlPreviewData {
            title,
            description,
            image_url,
            canonical_url,
        } = extract_url_preview(page);
        assert_eq!(title, "Fallback Title");
        assert_eq!(description, "Fallback Description");
        assert_eq!(image_url, "");
        assert_eq!(canonical_url, "");
    }

    /// An empty document produces four empty fields.
    #[test]
    fn test_extract_url_preview_empty_html() {
        let UrlPreviewData {
            title,
            description,
            image_url,
            canonical_url,
        } = extract_url_preview("");
        assert_eq!(title, "");
        assert_eq!(description, "");
        assert_eq!(image_url, "");
        assert_eq!(canonical_url, "");
    }

    /// `og:title` wins even when the `<title>` element appears first in the document.
    #[test]
    fn test_extract_url_preview_og_overrides_title() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
                <title>Page Title</title>
                <meta property="og:title" content="OG Title" />
            </head>
            <body></body>
            </html>
        "#;

        assert_eq!(extract_url_preview(page).title, "OG Title");
    }

    /// `twitter:title` is preferred over the `<title>` element when `og:title` is absent.
    #[test]
    fn test_twitter_title_used_when_og_missing() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
                <title>HTML Title</title>
                <meta name="twitter:title" content="Twitter Title" />
            </head>
            <body></body>
            </html>
        "#;

        assert_eq!(extract_url_preview(page).title, "Twitter Title");
    }

    /// `twitter:description` is preferred over the plain meta description when
    /// `og:description` is absent.
    #[test]
    fn test_twitter_description_used_when_og_missing() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
                <meta name="twitter:description" content="Twitter Description" />
                <meta name="description" content="Plain Description" />
            </head>
            <body></body>
            </html>
        "#;

        assert_eq!(extract_url_preview(page).description, "Twitter Description");
    }

    /// `twitter:image` supplies the image URL when `og:image` is absent.
    #[test]
    fn test_twitter_image_used_when_og_missing() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
                <meta name="twitter:image" content="https://example.com/twitter.png" />
            </head>
            <body></body>
            </html>
        "#;

        assert_eq!(extract_url_preview(page).image_url, "https://example.com/twitter.png");
    }

    /// `twitter:image:src` is the last-resort image source.
    #[test]
    fn test_twitter_image_src_fallback() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
                <meta name="twitter:image:src" content="https://example.com/twitter-src.png" />
            </head>
            <body></body>
            </html>
        "#;

        assert_eq!(
            extract_url_preview(page).image_url,
            "https://example.com/twitter-src.png"
        );
    }

    /// `<link rel="canonical">` supplies the canonical URL when `og:url` is absent.
    #[test]
    fn test_canonical_link_used_when_og_url_missing() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
                <link rel="canonical" href="https://example.com/canonical-from-link" />
            </head>
            <body></body>
            </html>
        "#;

        assert_eq!(
            extract_url_preview(page).canonical_url,
            "https://example.com/canonical-from-link"
        );
    }

    /// An `og:title` with `content=""` does not block the fallback to `<title>`.
    #[test]
    fn test_empty_content_falls_through() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
                <meta property="og:title" content="" />
                <title>Real Title</title>
            </head>
            <body></body>
            </html>
        "#;

        assert_eq!(extract_url_preview(page).title, "Real Title");
    }

    /// When both OG and fallback sources are present for every field, OG wins each time.
    #[test]
    fn test_og_still_wins_over_twitter() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
                <meta property="og:title" content="OG Title" />
                <meta name="twitter:title" content="Twitter Title" />
                <meta property="og:description" content="OG Description" />
                <meta name="twitter:description" content="Twitter Description" />
                <meta property="og:image" content="https://example.com/og.png" />
                <meta name="twitter:image" content="https://example.com/twitter.png" />
                <meta property="og:url" content="https://example.com/og-canonical" />
                <link rel="canonical" href="https://example.com/link-canonical" />
            </head>
            <body></body>
            </html>
        "#;

        let UrlPreviewData {
            title,
            description,
            image_url,
            canonical_url,
        } = extract_url_preview(page);
        assert_eq!(title, "OG Title");
        assert_eq!(description, "OG Description");
        assert_eq!(image_url, "https://example.com/og.png");
        assert_eq!(canonical_url, "https://example.com/og-canonical");
    }

    // Fuzzing is compiled out on wasm32 targets.
    #[cfg(not(target_arch = "wasm32"))]
    mod bolero_fuzz {
        use super::*;

        /// Feeds arbitrary byte strings to the extractor; inputs that are not valid UTF-8
        /// are skipped, and the extracted result is discarded.
        #[test]
        fn fuzz_extract_url_preview() {
            bolero::check!().for_each(|bytes: &[u8]| {
                if let Ok(text) = std::str::from_utf8(bytes) {
                    let _ = extract_url_preview(text);
                }
            });
        }
    }
}