hashiverse_lib/tools/
url_preview.rs1use scraper::{Html, Selector};
15
/// Link-preview metadata pulled out of an HTML page by [`extract_url_preview`].
///
/// Each field holds an empty string when the page offers no usable value for it.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct UrlPreviewData {
    /// Title, taken from `og:title`, `twitter:title` or the `<title>` element (in that order).
    pub title: String,
    /// Summary, taken from `og:description`, `twitter:description` or the plain
    /// `description` meta tag (in that order).
    pub description: String,
    /// Preview image URL, taken from `og:image`, `twitter:image` or `twitter:image:src`
    /// (in that order). Stored exactly as it appears in the page.
    pub image_url: String,
    /// Canonical page URL, taken from `og:url` or `<link rel="canonical">` (in that order).
    pub canonical_url: String,
}
22
23pub fn extract_url_preview(html: &str) -> UrlPreviewData {
24 let document = Html::parse_document(html);
25
26 let title = first_non_empty([
27 select_meta_content(&document, "meta[property='og:title']"),
28 select_meta_content(&document, "meta[name='twitter:title']"),
29 select_title(&document),
30 ])
31 .unwrap_or_default();
32
33 let description = first_non_empty([
34 select_meta_content(&document, "meta[property='og:description']"),
35 select_meta_content(&document, "meta[name='twitter:description']"),
36 select_meta_content(&document, "meta[name='description']"),
37 ])
38 .unwrap_or_default();
39
40 let image_url = first_non_empty([
41 select_meta_content(&document, "meta[property='og:image']"),
42 select_meta_content(&document, "meta[name='twitter:image']"),
43 select_meta_content(&document, "meta[name='twitter:image:src']"),
44 ])
45 .unwrap_or_default();
46
47 let canonical_url = first_non_empty([
48 select_meta_content(&document, "meta[property='og:url']"),
49 select_link_href(&document, "link[rel='canonical']"),
50 ])
51 .unwrap_or_default();
52
53 UrlPreviewData {
54 title,
55 description,
56 image_url,
57 canonical_url,
58 }
59}
60
61fn select_meta_content(document: &Html, selector_str: &str) -> Option<String> {
62 let selector = Selector::parse(selector_str).ok()?;
63 document.select(&selector).next()?.value().attr("content").map(|s| s.to_string())
64}
65
66fn select_link_href(document: &Html, selector_str: &str) -> Option<String> {
67 let selector = Selector::parse(selector_str).ok()?;
68 document.select(&selector).next()?.value().attr("href").map(|s| s.to_string())
69}
70
71fn select_title(document: &Html) -> Option<String> {
72 let selector = Selector::parse("title").ok()?;
73 Some(document.select(&selector).next()?.text().collect::<String>())
74}
75
/// Returns the first candidate that still has content after trimming
/// surrounding whitespace, with that whitespace removed.
///
/// `None` entries, empty strings and whitespace-only strings are skipped, so
/// a blank value such as `content="  "` falls through to the next source
/// instead of winning. Trimming also strips the leading and trailing
/// newlines or indentation that commonly surround `<title>` text.
///
/// Returns `None` when no candidate has any non-whitespace content.
fn first_non_empty<I: IntoIterator<Item = Option<String>>>(candidates: I) -> Option<String> {
    candidates.into_iter().flatten().find_map(|candidate| {
        let trimmed = candidate.trim();
        (!trimmed.is_empty()).then(|| trimmed.to_owned())
    })
}
80
#[cfg(test)]
mod tests {
    use super::*;

    /// All four Open Graph tags are present and each one populates its field.
    #[test]
    fn test_extract_url_preview_with_og_tags() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
            <meta property="og:title" content="OG Title" />
            <meta property="og:description" content="OG Description" />
            <meta property="og:image" content="https://example.com/og.png" />
            <meta property="og:url" content="https://example.com/canonical" />
            <title>Page Title</title>
            </head>
            <body></body>
            </html>
        "#;

        let UrlPreviewData {
            title,
            description,
            image_url,
            canonical_url,
        } = extract_url_preview(page);

        assert_eq!(title, "OG Title");
        assert_eq!(description, "OG Description");
        assert_eq!(image_url, "https://example.com/og.png");
        assert_eq!(canonical_url, "https://example.com/canonical");
    }

    /// Without social tags, `<title>` and the plain description meta are used;
    /// fields with no source at all stay empty.
    #[test]
    fn test_extract_url_preview_fallback_to_title_and_meta_description() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
            <title>Fallback Title</title>
            <meta name="description" content="Fallback Description" />
            </head>
            <body></body>
            </html>
        "#;

        let UrlPreviewData {
            title,
            description,
            image_url,
            canonical_url,
        } = extract_url_preview(page);

        assert_eq!(title, "Fallback Title");
        assert_eq!(description, "Fallback Description");
        assert_eq!(image_url, "");
        assert_eq!(canonical_url, "");
    }

    /// An empty document yields an all-empty preview.
    #[test]
    fn test_extract_url_preview_empty_html() {
        let UrlPreviewData {
            title,
            description,
            image_url,
            canonical_url,
        } = extract_url_preview("");

        assert_eq!(title, "");
        assert_eq!(description, "");
        assert_eq!(image_url, "");
        assert_eq!(canonical_url, "");
    }

    /// `og:title` takes priority even when `<title>` appears first in the markup.
    #[test]
    fn test_extract_url_preview_og_overrides_title() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
            <title>Page Title</title>
            <meta property="og:title" content="OG Title" />
            </head>
            <body></body>
            </html>
        "#;

        let preview = extract_url_preview(page);
        assert_eq!(preview.title, "OG Title");
    }

    /// `twitter:title` outranks `<title>` when no `og:title` exists.
    #[test]
    fn test_twitter_title_used_when_og_missing() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
            <title>HTML Title</title>
            <meta name="twitter:title" content="Twitter Title" />
            </head>
            <body></body>
            </html>
        "#;

        let preview = extract_url_preview(page);
        assert_eq!(preview.title, "Twitter Title");
    }

    /// `twitter:description` outranks the plain description meta tag.
    #[test]
    fn test_twitter_description_used_when_og_missing() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
            <meta name="twitter:description" content="Twitter Description" />
            <meta name="description" content="Plain Description" />
            </head>
            <body></body>
            </html>
        "#;

        let preview = extract_url_preview(page);
        assert_eq!(preview.description, "Twitter Description");
    }

    /// `twitter:image` is used when there is no `og:image`.
    #[test]
    fn test_twitter_image_used_when_og_missing() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
            <meta name="twitter:image" content="https://example.com/twitter.png" />
            </head>
            <body></body>
            </html>
        "#;

        let preview = extract_url_preview(page);
        assert_eq!(preview.image_url, "https://example.com/twitter.png");
    }

    /// `twitter:image:src` is the last-resort image source.
    #[test]
    fn test_twitter_image_src_fallback() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
            <meta name="twitter:image:src" content="https://example.com/twitter-src.png" />
            </head>
            <body></body>
            </html>
        "#;

        let preview = extract_url_preview(page);
        assert_eq!(preview.image_url, "https://example.com/twitter-src.png");
    }

    /// `<link rel="canonical">` supplies the URL when `og:url` is absent.
    #[test]
    fn test_canonical_link_used_when_og_url_missing() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
            <link rel="canonical" href="https://example.com/canonical-from-link" />
            </head>
            <body></body>
            </html>
        "#;

        let preview = extract_url_preview(page);
        assert_eq!(preview.canonical_url, "https://example.com/canonical-from-link");
    }

    /// A tag that is present but has empty content does not block lower-priority sources.
    #[test]
    fn test_empty_content_falls_through() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
            <meta property="og:title" content="" />
            <title>Real Title</title>
            </head>
            <body></body>
            </html>
        "#;

        let preview = extract_url_preview(page);
        assert_eq!(preview.title, "Real Title");
    }

    /// With every source present, the Open Graph value wins for each field.
    #[test]
    fn test_og_still_wins_over_twitter() {
        let page = r#"
            <!DOCTYPE html>
            <html>
            <head>
            <meta property="og:title" content="OG Title" />
            <meta name="twitter:title" content="Twitter Title" />
            <meta property="og:description" content="OG Description" />
            <meta name="twitter:description" content="Twitter Description" />
            <meta property="og:image" content="https://example.com/og.png" />
            <meta name="twitter:image" content="https://example.com/twitter.png" />
            <meta property="og:url" content="https://example.com/og-canonical" />
            <link rel="canonical" href="https://example.com/link-canonical" />
            </head>
            <body></body>
            </html>
        "#;

        let UrlPreviewData {
            title,
            description,
            image_url,
            canonical_url,
        } = extract_url_preview(page);

        assert_eq!(title, "OG Title");
        assert_eq!(description, "OG Description");
        assert_eq!(image_url, "https://example.com/og.png");
        assert_eq!(canonical_url, "https://example.com/og-canonical");
    }

    // Fuzzing is compiled out on wasm32 targets.
    #[cfg(not(target_arch = "wasm32"))]
    mod bolero_fuzz {
        use super::*;

        /// Feeds arbitrary bytes to the extractor; the result is discarded, so
        /// the only thing checked is that extraction completes without panicking.
        #[test]
        fn fuzz_extract_url_preview() {
            bolero::check!().for_each(|bytes: &[u8]| {
                // The extractor takes `&str`, so inputs that are not valid UTF-8 are skipped.
                if let Ok(markup) = std::str::from_utf8(bytes) {
                    let _ = extract_url_preview(markup);
                }
            });
        }
    }
}