Files
openclaw-backups/skills/fast-browser-use/tests/sitemap_integration.rs

327 lines
12 KiB
Rust

//! Integration tests for the sitemap analysis tool
use browser_use::{
BrowserSession, LaunchOptions,
tools::{sitemap::{SitemapTool, SitemapParams}, Tool, ToolContext},
};
use log::info;
/// Smoke test: runs `SitemapTool` against `https://example.com` with structure
/// analysis disabled and checks the top-level shape of the returned JSON.
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_tool_basic() {
    // The init result is discarded so that a logger installed by an earlier
    // test does not fail this one.
    env_logger::try_init().ok();

    let launch_options = LaunchOptions::new().headless(true);
    let session = BrowserSession::launch(launch_options).expect("Failed to launch browser");

    // Small, stable public site; up to 5 pages, no per-page structure analysis.
    let params = SitemapParams {
        url: "https://example.com".to_string(),
        analyze_structure: false,
        max_pages: 5,
    };

    let tool = SitemapTool::default();
    let mut context = ToolContext::new(&session);
    let result = tool
        .execute_typed(params, &mut context)
        .expect("Failed to execute sitemap tool");

    assert!(result.success, "Tool execution should succeed");
    assert!(result.data.is_some(), "Should have result data");

    let data = result.data.unwrap();
    info!("Sitemap result: {}", serde_json::to_string_pretty(&data).unwrap());

    // Only the shape of the payload is verified, not its contents.
    assert!(data["base_url"].as_str().is_some());
    assert!(data["sitemaps"].is_array());
    assert!(data["pages"].is_array());
}
/// Runs `SitemapTool` with `analyze_structure: true` and checks that the
/// result carries a `page_structures` array whose first entry (when present)
/// has the expected fields.
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_tool_with_structure_analysis() {
    env_logger::try_init().ok();

    let launch_options = LaunchOptions::new().headless(true);
    let session = BrowserSession::launch(launch_options).expect("Failed to launch browser");

    // Structure analysis enabled, capped at two pages.
    let params = SitemapParams {
        url: "https://example.com".to_string(),
        analyze_structure: true,
        max_pages: 2,
    };

    let tool = SitemapTool::default();
    let mut context = ToolContext::new(&session);
    let result = tool
        .execute_typed(params, &mut context)
        .expect("Failed to execute sitemap tool");

    assert!(result.success, "Tool execution should succeed");
    assert!(result.data.is_some(), "Should have result data");

    let data = result.data.unwrap();
    info!("Sitemap with structure: {}", serde_json::to_string_pretty(&data).unwrap());

    // The array itself must exist even if no page ended up being analyzed.
    let page_structures = data["page_structures"].as_array();
    assert!(page_structures.is_some(), "Should have page_structures array");

    // An empty array is accepted; the entry format is only checked when at
    // least one page was analyzed.
    if let Some(first) = page_structures.and_then(|entries| entries.first()) {
        assert!(first["url"].as_str().is_some(), "Page structure should have url");
        assert!(first["title"].as_str().is_some(), "Page structure should have title");
        assert!(first["headings"].is_array(), "Page structure should have headings array");
        assert!(first["nav_links"].is_array(), "Page structure should have nav_links array");
        assert!(first["sections"].is_array(), "Page structure should have sections array");
    }
}
/// Exercises the free function `analyze_sitemap` directly (bypassing the
/// `Tool` wrapper) and checks the typed result.
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_analyze_function() {
    env_logger::try_init().ok();

    let launch_options = LaunchOptions::new().headless(true);
    let session = BrowserSession::launch(launch_options).expect("Failed to launch browser");

    // Same arguments as the tool-based tests: structure analysis on, limit 2.
    let analysis =
        browser_use::tools::sitemap::analyze_sitemap(&session, "https://example.com", true, 2)
            .expect("Failed to analyze sitemap");
    info!("Analyze sitemap result: {:?}", analysis);

    assert_eq!(analysis.base_url, "https://example.com");

    // Structure entries are optional; validate the first one only when present.
    if let Some(homepage) = analysis.page_structures.first() {
        assert!(!homepage.url.is_empty(), "Should have URL");
        assert!(!homepage.title.is_empty(), "Should have title");
    }
}
/// Loads a fixed HTML fixture through a `data:` URL, runs a structure-extraction
/// script in the page, and checks the JSON it returns against the fixture.
///
/// The fixture contains exactly four headings (h1, h2, h2, h3), three nav
/// links, a `<main id="content">` element and a description meta tag, so every
/// assertion below can be exact rather than a lower bound.
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_page_structure_extraction() {
    env_logger::try_init().ok();
    let session = BrowserSession::launch(LaunchOptions::new().headless(true))
        .expect("Failed to launch browser");

    // Create a test page with known structure using data: URL
    let test_html = r#"
<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
<meta name="description" content="A test page for sitemap analysis">
<meta name="keywords" content="test, sitemap, analysis">
<link rel="canonical" href="https://example.com/test">
</head>
<body>
<header>
<nav>
<a href="/home">Home</a>
<a href="/about">About</a>
<a href="/contact">Contact</a>
</nav>
</header>
<main id="content" role="main">
<h1>Welcome to the Test Page</h1>
<section>
<h2>Section One</h2>
<p>Some content here with multiple words to count.</p>
</section>
<section>
<h2>Section Two</h2>
<p>More content in this section.</p>
</section>
</main>
<aside>
<h3>Sidebar</h3>
</aside>
<footer>
<p>Footer content</p>
</footer>
</body>
</html>
"#;

    // Navigate to the test page
    let data_url = format!("data:text/html,{}", urlencoding::encode(test_html));
    session.navigate(&data_url).expect("Failed to navigate");
    session.wait_for_navigation().expect("Failed to wait for navigation");
    // NOTE(review): fixed pause kept from the original; presumably it lets the
    // page settle after navigation — confirm whether it is still needed.
    std::thread::sleep(std::time::Duration::from_millis(500));

    // Extract structure using JavaScript evaluation directly
    let tab = session.tab().expect("Failed to get tab");
    let structure_js = r#"
(function() {
var structure = {
url: window.location.href,
title: document.title,
headings: [],
nav_links: [],
sections: [],
main_content: null,
meta: {}
};
var headings = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
for (var i = 0; i < headings.length && i < 50; i++) {
structure.headings.push({
level: parseInt(headings[i].tagName.charAt(1)),
text: headings[i].innerText.trim().substring(0, 200)
});
}
var navs = document.querySelectorAll('nav a, header a, [role="navigation"] a');
var seenLinks = new Set();
for (var i = 0; i < navs.length && structure.nav_links.length < 30; i++) {
var href = navs[i].getAttribute('href');
var text = navs[i].innerText.trim();
if (href && text && !seenLinks.has(href)) {
seenLinks.add(href);
structure.nav_links.push({ text: text.substring(0, 100), href: href });
}
}
var sections = document.querySelectorAll('main, article, section, aside, footer');
for (var i = 0; i < sections.length && i < 20; i++) {
var el = sections[i];
structure.sections.push({
tag: el.tagName.toLowerCase(),
id: el.id || null,
class: el.className ? el.className.substring(0, 100) : null,
role: el.getAttribute('role') || null
});
}
var main = document.querySelector('main, [role="main"], #main, #content, .main-content');
if (main) {
structure.main_content = {
tag: main.tagName.toLowerCase(),
id: main.id || null,
word_count: main.innerText.split(/\s+/).length
};
}
var metaDesc = document.querySelector('meta[name="description"]');
if (metaDesc) structure.meta.description = metaDesc.getAttribute('content');
return JSON.stringify(structure);
})()
"#;

    // The script returns `JSON.stringify(...)`, so the evaluated value is a
    // JSON string that has to be parsed a second time.
    let result = tab.evaluate(structure_js, false).expect("Failed to evaluate JS");
    let json_value = result.value.expect("JS evaluation should return a value");
    let json_str = json_value
        .as_str()
        .expect("JS evaluation should return a JSON string");
    let structure: serde_json::Value =
        serde_json::from_str(json_str).expect("Failed to parse structure");
    info!("Page structure: {}", serde_json::to_string_pretty(&structure).unwrap());

    // Verify structure extraction
    assert_eq!(structure["title"].as_str(), Some("Test Page"));

    // Check headings: the fixture defines exactly h1, h2, h2, h3 in document order.
    let headings = structure["headings"]
        .as_array()
        .expect("headings should be an array");
    let levels: Vec<Option<u64>> = headings
        .iter()
        .map(|heading| heading["level"].as_u64())
        .collect();
    assert_eq!(
        levels,
        [Some(1), Some(2), Some(2), Some(3)],
        "Should have exactly 4 headings (h1, h2, h2, h3)"
    );
    assert_eq!(headings[0]["text"].as_str(), Some("Welcome to the Test Page"));

    // Check nav links
    let nav_links = structure["nav_links"]
        .as_array()
        .expect("nav_links should be an array");
    assert_eq!(nav_links.len(), 3, "Should have 3 nav links");

    // Check sections
    let sections = structure["sections"]
        .as_array()
        .expect("sections should be an array");
    assert!(!sections.is_empty(), "Should have sections");

    // Check main content
    let main_content = &structure["main_content"];
    assert!(main_content.is_object(), "Should have main_content");
    assert_eq!(main_content["tag"].as_str(), Some("main"));
    assert_eq!(main_content["id"].as_str(), Some("content"));
    let word_count = main_content["word_count"]
        .as_u64()
        .expect("word_count should be a non-negative integer");
    assert!(word_count > 0, "Should have word count");

    // Check meta
    assert_eq!(
        structure["meta"]["description"].as_str(),
        Some("A test page for sitemap analysis")
    );
}
/// Runs `SitemapTool` against `https://www.google.com`, a site expected to
/// publish a robots.txt with sitemap references.
///
/// Only the reported `base_url` is asserted; nothing about robots.txt content
/// is checked. The test depends on the public network and may be flaky.
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_robots_txt_parsing() {
    env_logger::try_init().ok();

    let launch_options = LaunchOptions::new().headless(true);
    let session = BrowserSession::launch(launch_options).expect("Failed to launch browser");

    // One page at most, no structure analysis: only sitemap discovery matters here.
    let params = SitemapParams {
        url: "https://www.google.com".to_string(),
        analyze_structure: false,
        max_pages: 1,
    };

    let tool = SitemapTool::default();
    let mut context = ToolContext::new(&session);
    let result = tool
        .execute_typed(params, &mut context)
        .expect("Failed to execute sitemap tool");
    assert!(result.success, "Tool execution should succeed");

    let data = result.data.unwrap();
    info!("Google sitemap result: {}", serde_json::to_string_pretty(&data).unwrap());

    // The tool should echo back the URL it was asked to analyze.
    assert_eq!(data["base_url"].as_str(), Some("https://www.google.com"));
}
/// Checks that `max_pages` bounds the number of analyzed pages: with a limit
/// of 1, `page_structures` must contain at most one entry.
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_max_pages_limit() {
    env_logger::try_init().ok();
    let session = BrowserSession::launch(LaunchOptions::new().headless(true))
        .expect("Failed to launch browser");
    let tool = SitemapTool::default();
    let mut context = ToolContext::new(&session);

    let params = SitemapParams {
        url: "https://example.com".to_string(),
        analyze_structure: true,
        max_pages: 1, // Limit to 1 page
    };
    let result = tool
        .execute_typed(params, &mut context)
        .expect("Failed to execute sitemap tool");

    // Carry a message on every failure path so a broken run is diagnosable
    // from the panic text alone.
    assert!(result.success, "Tool execution should succeed");
    let data = result.data.expect("Should have result data");
    let page_structures = data["page_structures"]
        .as_array()
        .expect("Should have page_structures array");

    // Should not exceed max_pages
    assert!(
        page_structures.len() <= 1,
        "Should not analyze more than max_pages (1)"
    );
}