//! Integration tests for the sitemap analysis tool
|
|
|
|
use browser_use::{
|
|
BrowserSession, LaunchOptions,
|
|
tools::{sitemap::{SitemapTool, SitemapParams}, Tool, ToolContext},
|
|
};
|
|
use log::info;
|
|
|
|
/// Smoke test for the sitemap tool with structure analysis disabled.
///
/// Runs the tool against example.com and checks only the shape of the
/// returned data: `base_url` is a string, `sitemaps` and `pages` are arrays.
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_tool_basic() {
    env_logger::try_init().ok();

    let launch_options = LaunchOptions::new().headless(true);
    let session = BrowserSession::launch(launch_options).expect("Failed to launch browser");

    // Exercise the tool against a real, simple site (example.com).
    let sitemap_tool = SitemapTool::default();
    let mut ctx = ToolContext::new(&session);

    let outcome = sitemap_tool
        .execute_typed(
            SitemapParams {
                url: "https://example.com".to_string(),
                analyze_structure: false,
                max_pages: 5,
            },
            &mut ctx,
        )
        .expect("Failed to execute sitemap tool");

    assert!(outcome.success, "Tool execution should succeed");

    // `expect` on `None` panics with exactly this message, so a missing
    // payload is reported the same way as a dedicated assertion would.
    let data = outcome.data.expect("Should have result data");
    info!("Sitemap result: {}", serde_json::to_string_pretty(&data).unwrap());

    // Verify the top-level fields of the result.
    assert!(data["base_url"].as_str().is_some());
    assert!(data["sitemaps"].is_array());
    assert!(data["pages"].is_array());
}
|
|
|
|
/// Runs the sitemap tool with structure analysis enabled.
///
/// The result must carry a `page_structures` array. When that array is
/// non-empty, its first entry is checked for the expected fields (`url`,
/// `title`, `headings`, `nav_links`, `sections`).
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_tool_with_structure_analysis() {
    env_logger::try_init().ok();

    let launch_options = LaunchOptions::new().headless(true);
    let session = BrowserSession::launch(launch_options).expect("Failed to launch browser");

    let sitemap_tool = SitemapTool::default();
    let mut ctx = ToolContext::new(&session);

    // Use a site that we know has structure
    let outcome = sitemap_tool
        .execute_typed(
            SitemapParams {
                url: "https://example.com".to_string(),
                analyze_structure: true,
                max_pages: 2,
            },
            &mut ctx,
        )
        .expect("Failed to execute sitemap tool");

    assert!(outcome.success, "Tool execution should succeed");

    let data = outcome.data.expect("Should have result data");
    info!("Sitemap with structure: {}", serde_json::to_string_pretty(&data).unwrap());

    // Structure analysis must at least produce the array, even if it is empty.
    let structures = data["page_structures"]
        .as_array()
        .expect("Should have page_structures array");

    // Only when a page was actually analyzed can the entry format be checked.
    if let Some(first) = structures.first() {
        assert!(first["url"].as_str().is_some(), "Page structure should have url");
        assert!(first["title"].as_str().is_some(), "Page structure should have title");
        assert!(first["headings"].is_array(), "Page structure should have headings array");
        assert!(first["nav_links"].is_array(), "Page structure should have nav_links array");
        assert!(first["sections"].is_array(), "Page structure should have sections array");
    }
}
|
|
|
|
/// Exercises the standalone `analyze_sitemap` function (bypassing `SitemapTool`).
///
/// Checks that the returned `base_url` echoes the requested URL and, when any
/// page structure was produced, that the first one has a non-empty URL and title.
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_analyze_function() {
    env_logger::try_init().ok();

    let launch_options = LaunchOptions::new().headless(true);
    let session = BrowserSession::launch(launch_options).expect("Failed to launch browser");

    // NOTE(review): the trailing `true, 2` presumably mirror
    // `SitemapParams::analyze_structure` / `max_pages` — confirm against the
    // function signature.
    let analysis =
        browser_use::tools::sitemap::analyze_sitemap(&session, "https://example.com", true, 2)
            .expect("Failed to analyze sitemap");

    info!("Analyze sitemap result: {:?}", analysis);

    assert_eq!(analysis.base_url, "https://example.com");

    // Structure analysis may legitimately yield nothing; only inspect the
    // first entry when there is one.
    if let Some(homepage) = analysis.page_structures.first() {
        assert!(!homepage.url.is_empty(), "Should have URL");
        assert!(!homepage.title.is_empty(), "Should have title");
    }
}
|
|
|
|
/// Checks page-structure extraction against a self-contained fixture page.
///
/// A small HTML document with known headings, navigation links, sections,
/// main content and meta description is loaded through a `data:` URL. The
/// extraction script is then evaluated directly on the tab (not through
/// `SitemapTool`) and the extracted fields are compared with the fixture.
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_page_structure_extraction() {
    env_logger::try_init().ok();

    let session = BrowserSession::launch(LaunchOptions::new().headless(true))
        .expect("Failed to launch browser");

    // Fixture page with a known structure: 4 headings (h1, h2, h2, h3),
    // 3 nav links, a <main id="content"> element and a meta description.
    let test_html = r#"
        <!DOCTYPE html>
        <html>
        <head>
            <title>Test Page</title>
            <meta name="description" content="A test page for sitemap analysis">
            <meta name="keywords" content="test, sitemap, analysis">
            <link rel="canonical" href="https://example.com/test">
        </head>
        <body>
            <header>
                <nav>
                    <a href="/home">Home</a>
                    <a href="/about">About</a>
                    <a href="/contact">Contact</a>
                </nav>
            </header>
            <main id="content" role="main">
                <h1>Welcome to the Test Page</h1>
                <section>
                    <h2>Section One</h2>
                    <p>Some content here with multiple words to count.</p>
                </section>
                <section>
                    <h2>Section Two</h2>
                    <p>More content in this section.</p>
                </section>
            </main>
            <aside>
                <h3>Sidebar</h3>
            </aside>
            <footer>
                <p>Footer content</p>
            </footer>
        </body>
        </html>
    "#;

    // Navigate to the test page. The HTML is percent-encoded so it can be
    // embedded in a data: URL.
    let data_url = format!("data:text/html,{}", urlencoding::encode(test_html));
    session.navigate(&data_url).expect("Failed to navigate");
    session.wait_for_navigation().expect("Failed to wait for navigation");

    // Extra settling time after navigation before the DOM is queried.
    std::thread::sleep(std::time::Duration::from_millis(500));

    // Extract structure using JavaScript evaluation directly. The script
    // returns a JSON string, which is parsed on the Rust side below.
    let tab = session.tab().expect("Failed to get tab");
    let structure_js = r#"
        (function() {
            var structure = {
                url: window.location.href,
                title: document.title,
                headings: [],
                nav_links: [],
                sections: [],
                main_content: null,
                meta: {}
            };

            var headings = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
            for (var i = 0; i < headings.length && i < 50; i++) {
                structure.headings.push({
                    level: parseInt(headings[i].tagName.charAt(1)),
                    text: headings[i].innerText.trim().substring(0, 200)
                });
            }

            var navs = document.querySelectorAll('nav a, header a, [role="navigation"] a');
            var seenLinks = new Set();
            for (var i = 0; i < navs.length && structure.nav_links.length < 30; i++) {
                var href = navs[i].getAttribute('href');
                var text = navs[i].innerText.trim();
                if (href && text && !seenLinks.has(href)) {
                    seenLinks.add(href);
                    structure.nav_links.push({ text: text.substring(0, 100), href: href });
                }
            }

            var sections = document.querySelectorAll('main, article, section, aside, footer');
            for (var i = 0; i < sections.length && i < 20; i++) {
                var el = sections[i];
                structure.sections.push({
                    tag: el.tagName.toLowerCase(),
                    id: el.id || null,
                    class: el.className ? el.className.substring(0, 100) : null,
                    role: el.getAttribute('role') || null
                });
            }

            var main = document.querySelector('main, [role="main"], #main, #content, .main-content');
            if (main) {
                structure.main_content = {
                    tag: main.tagName.toLowerCase(),
                    id: main.id || null,
                    word_count: main.innerText.split(/\s+/).length
                };
            }

            var metaDesc = document.querySelector('meta[name="description"]');
            if (metaDesc) structure.meta.description = metaDesc.getAttribute('content');

            return JSON.stringify(structure);
        })()
    "#;

    let result = tab.evaluate(structure_js, false).expect("Failed to evaluate JS");
    let json_value = result.value.expect("JS evaluation returned no value");
    let json_str = json_value
        .as_str()
        .expect("JS evaluation should return a JSON string");

    let structure: serde_json::Value =
        serde_json::from_str(json_str).expect("Failed to parse structure");

    info!("Page structure: {}", serde_json::to_string_pretty(&structure).unwrap());

    // Verify structure extraction
    assert_eq!(structure["title"].as_str(), Some("Test Page"));

    // Check headings. The fixture is fully controlled, so the count is exact.
    let headings = structure["headings"]
        .as_array()
        .expect("structure.headings should be an array");
    assert_eq!(headings.len(), 4, "Should have exactly 4 headings (h1, h2, h2, h3)");
    assert_eq!(headings[0]["level"].as_u64(), Some(1));
    assert_eq!(headings[0]["text"].as_str(), Some("Welcome to the Test Page"));

    // Check nav links
    let nav_links = structure["nav_links"]
        .as_array()
        .expect("structure.nav_links should be an array");
    assert_eq!(nav_links.len(), 3, "Should have 3 nav links");

    // Check sections
    let sections = structure["sections"]
        .as_array()
        .expect("structure.sections should be an array");
    assert!(!sections.is_empty(), "Should have sections");

    // Check main content
    let main_content = &structure["main_content"];
    assert!(main_content.is_object(), "Should have main_content");
    assert_eq!(main_content["tag"].as_str(), Some("main"));
    assert_eq!(main_content["id"].as_str(), Some("content"));

    // In JavaScript `"".split(/\s+/).length` is 1, so a `> 0` check would pass
    // even for an empty <main>. Requiring more than one token proves that real
    // text was counted.
    let word_count = main_content["word_count"]
        .as_u64()
        .expect("main_content.word_count should be a non-negative integer");
    assert!(word_count > 1, "Should have word count");

    // Check meta
    assert_eq!(
        structure["meta"]["description"].as_str(),
        Some("A test page for sitemap analysis")
    );
}
|
|
|
|
/// Runs the sitemap tool against a live site that is expected to publish a
/// robots.txt with sitemap references.
///
/// Only the shape of the result is asserted (`base_url` echo and a `sitemaps`
/// array). The discovered sitemap entries themselves depend on the remote
/// site and on network conditions, so their content is deliberately not
/// checked. This test may be flaky when the network is unavailable.
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_robots_txt_parsing() {
    env_logger::try_init().ok();

    let session = BrowserSession::launch(LaunchOptions::new().headless(true))
        .expect("Failed to launch browser");

    let tool = SitemapTool::default();
    let mut context = ToolContext::new(&session);

    // Single source of truth for the URL used in both the request and the
    // `base_url` assertion below.
    let target_url = "https://www.google.com";

    // Test with a site known to have robots.txt with sitemap
    let params = SitemapParams {
        url: target_url.to_string(),
        analyze_structure: false,
        max_pages: 1,
    };

    let result = tool
        .execute_typed(params, &mut context)
        .expect("Failed to execute sitemap tool");

    assert!(result.success, "Tool execution should succeed");

    let data = result.data.expect("Should have result data");
    info!("Google sitemap result: {}", serde_json::to_string_pretty(&data).unwrap());

    assert_eq!(data["base_url"].as_str(), Some(target_url));

    // The sitemap list must be present in the result; whether it is non-empty
    // depends on the live robots.txt and is not asserted here.
    assert!(data["sitemaps"].is_array(), "Should have sitemaps array");
}
|
|
|
|
/// Verifies that structure analysis honours the `max_pages` limit.
///
/// The tool is run with `max_pages: 1` and structure analysis enabled; the
/// resulting `page_structures` array must contain at most one entry.
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_max_pages_limit() {
    env_logger::try_init().ok();

    let session = BrowserSession::launch(LaunchOptions::new().headless(true))
        .expect("Failed to launch browser");

    let tool = SitemapTool::default();
    let mut context = ToolContext::new(&session);

    let params = SitemapParams {
        url: "https://example.com".to_string(),
        analyze_structure: true,
        max_pages: 1, // Limit to 1 page
    };

    let result = tool
        .execute_typed(params, &mut context)
        .expect("Failed to execute sitemap tool");

    assert!(result.success, "Tool execution should succeed");

    let data = result.data.expect("Should have result data");
    let page_structures = data["page_structures"]
        .as_array()
        .expect("Should have page_structures array");

    // Should not exceed max_pages
    assert!(
        page_structures.len() <= 1,
        "Should not analyze more than max_pages (1)"
    );
}
|