AI Newsletter Digest improvements: fixed QP soft line break decoding, URL extraction, and content cleaning
This commit is contained in:
326
skills/fast-browser-use/tests/sitemap_integration.rs
Normal file
326
skills/fast-browser-use/tests/sitemap_integration.rs
Normal file
@@ -0,0 +1,326 @@
|
||||
//! Integration tests for the sitemap analysis tool
|
||||
|
||||
use browser_use::{
|
||||
BrowserSession, LaunchOptions,
|
||||
tools::{sitemap::{SitemapTool, SitemapParams}, Tool, ToolContext},
|
||||
};
|
||||
use log::info;
|
||||
|
||||
#[test]
|
||||
#[ignore] // Requires Chrome to be installed
|
||||
fn test_sitemap_tool_basic() {
|
||||
env_logger::try_init().ok();
|
||||
|
||||
let session = BrowserSession::launch(LaunchOptions::new().headless(true))
|
||||
.expect("Failed to launch browser");
|
||||
|
||||
// Test with a real site that has a sitemap (example.com is simple)
|
||||
let tool = SitemapTool::default();
|
||||
let mut context = ToolContext::new(&session);
|
||||
|
||||
let params = SitemapParams {
|
||||
url: "https://example.com".to_string(),
|
||||
analyze_structure: false,
|
||||
max_pages: 5,
|
||||
};
|
||||
|
||||
let result = tool.execute_typed(params, &mut context).expect("Failed to execute sitemap tool");
|
||||
|
||||
assert!(result.success, "Tool execution should succeed");
|
||||
assert!(result.data.is_some(), "Should have result data");
|
||||
|
||||
let data = result.data.unwrap();
|
||||
info!("Sitemap result: {}", serde_json::to_string_pretty(&data).unwrap());
|
||||
|
||||
// Verify basic structure
|
||||
assert!(data["base_url"].as_str().is_some());
|
||||
assert!(data["sitemaps"].is_array());
|
||||
assert!(data["pages"].is_array());
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[ignore] // Requires Chrome to be installed
|
||||
fn test_sitemap_tool_with_structure_analysis() {
|
||||
env_logger::try_init().ok();
|
||||
|
||||
let session = BrowserSession::launch(LaunchOptions::new().headless(true))
|
||||
.expect("Failed to launch browser");
|
||||
|
||||
let tool = SitemapTool::default();
|
||||
let mut context = ToolContext::new(&session);
|
||||
|
||||
// Use a site that we know has structure
|
||||
let params = SitemapParams {
|
||||
url: "https://example.com".to_string(),
|
||||
analyze_structure: true,
|
||||
max_pages: 2,
|
||||
};
|
||||
|
||||
let result = tool.execute_typed(params, &mut context).expect("Failed to execute sitemap tool");
|
||||
|
||||
assert!(result.success, "Tool execution should succeed");
|
||||
assert!(result.data.is_some(), "Should have result data");
|
||||
|
||||
let data = result.data.unwrap();
|
||||
info!("Sitemap with structure: {}", serde_json::to_string_pretty(&data).unwrap());
|
||||
|
||||
// Verify structure analysis was performed
|
||||
let page_structures = data["page_structures"].as_array();
|
||||
assert!(page_structures.is_some(), "Should have page_structures array");
|
||||
|
||||
// If any pages were analyzed, check the structure format
|
||||
if let Some(structures) = page_structures {
|
||||
if !structures.is_empty() {
|
||||
let first = &structures[0];
|
||||
assert!(first["url"].as_str().is_some(), "Page structure should have url");
|
||||
assert!(first["title"].as_str().is_some(), "Page structure should have title");
|
||||
assert!(first["headings"].is_array(), "Page structure should have headings array");
|
||||
assert!(first["nav_links"].is_array(), "Page structure should have nav_links array");
|
||||
assert!(first["sections"].is_array(), "Page structure should have sections array");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[ignore] // Requires Chrome to be installed
|
||||
fn test_sitemap_analyze_function() {
|
||||
env_logger::try_init().ok();
|
||||
|
||||
let session = BrowserSession::launch(LaunchOptions::new().headless(true))
|
||||
.expect("Failed to launch browser");
|
||||
|
||||
// Test the standalone analyze_sitemap function
|
||||
let result = browser_use::tools::sitemap::analyze_sitemap(
|
||||
&session,
|
||||
"https://example.com",
|
||||
true,
|
||||
2,
|
||||
).expect("Failed to analyze sitemap");
|
||||
|
||||
info!("Analyze sitemap result: {:?}", result);
|
||||
|
||||
assert_eq!(result.base_url, "https://example.com");
|
||||
|
||||
// If structure analysis was performed on homepage
|
||||
if !result.page_structures.is_empty() {
|
||||
let homepage = &result.page_structures[0];
|
||||
assert!(!homepage.url.is_empty(), "Should have URL");
|
||||
assert!(!homepage.title.is_empty(), "Should have title");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[ignore] // Requires Chrome to be installed
|
||||
fn test_sitemap_page_structure_extraction() {
|
||||
env_logger::try_init().ok();
|
||||
|
||||
let session = BrowserSession::launch(LaunchOptions::new().headless(true))
|
||||
.expect("Failed to launch browser");
|
||||
|
||||
// Create a test page with known structure using data: URL
|
||||
let test_html = r#"
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Test Page</title>
|
||||
<meta name="description" content="A test page for sitemap analysis">
|
||||
<meta name="keywords" content="test, sitemap, analysis">
|
||||
<link rel="canonical" href="https://example.com/test">
|
||||
</head>
|
||||
<body>
|
||||
<header>
|
||||
<nav>
|
||||
<a href="/home">Home</a>
|
||||
<a href="/about">About</a>
|
||||
<a href="/contact">Contact</a>
|
||||
</nav>
|
||||
</header>
|
||||
<main id="content" role="main">
|
||||
<h1>Welcome to the Test Page</h1>
|
||||
<section>
|
||||
<h2>Section One</h2>
|
||||
<p>Some content here with multiple words to count.</p>
|
||||
</section>
|
||||
<section>
|
||||
<h2>Section Two</h2>
|
||||
<p>More content in this section.</p>
|
||||
</section>
|
||||
</main>
|
||||
<aside>
|
||||
<h3>Sidebar</h3>
|
||||
</aside>
|
||||
<footer>
|
||||
<p>Footer content</p>
|
||||
</footer>
|
||||
</body>
|
||||
</html>
|
||||
"#;
|
||||
|
||||
// Navigate to the test page
|
||||
let data_url = format!("data:text/html,{}", urlencoding::encode(test_html));
|
||||
session.navigate(&data_url).expect("Failed to navigate");
|
||||
session.wait_for_navigation().expect("Failed to wait for navigation");
|
||||
|
||||
std::thread::sleep(std::time::Duration::from_millis(500));
|
||||
|
||||
// Extract structure using JavaScript evaluation directly
|
||||
let tab = session.tab().expect("Failed to get tab");
|
||||
let structure_js = r#"
|
||||
(function() {
|
||||
var structure = {
|
||||
url: window.location.href,
|
||||
title: document.title,
|
||||
headings: [],
|
||||
nav_links: [],
|
||||
sections: [],
|
||||
main_content: null,
|
||||
meta: {}
|
||||
};
|
||||
|
||||
var headings = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
|
||||
for (var i = 0; i < headings.length && i < 50; i++) {
|
||||
structure.headings.push({
|
||||
level: parseInt(headings[i].tagName.charAt(1)),
|
||||
text: headings[i].innerText.trim().substring(0, 200)
|
||||
});
|
||||
}
|
||||
|
||||
var navs = document.querySelectorAll('nav a, header a, [role="navigation"] a');
|
||||
var seenLinks = new Set();
|
||||
for (var i = 0; i < navs.length && structure.nav_links.length < 30; i++) {
|
||||
var href = navs[i].getAttribute('href');
|
||||
var text = navs[i].innerText.trim();
|
||||
if (href && text && !seenLinks.has(href)) {
|
||||
seenLinks.add(href);
|
||||
structure.nav_links.push({ text: text.substring(0, 100), href: href });
|
||||
}
|
||||
}
|
||||
|
||||
var sections = document.querySelectorAll('main, article, section, aside, footer');
|
||||
for (var i = 0; i < sections.length && i < 20; i++) {
|
||||
var el = sections[i];
|
||||
structure.sections.push({
|
||||
tag: el.tagName.toLowerCase(),
|
||||
id: el.id || null,
|
||||
class: el.className ? el.className.substring(0, 100) : null,
|
||||
role: el.getAttribute('role') || null
|
||||
});
|
||||
}
|
||||
|
||||
var main = document.querySelector('main, [role="main"], #main, #content, .main-content');
|
||||
if (main) {
|
||||
structure.main_content = {
|
||||
tag: main.tagName.toLowerCase(),
|
||||
id: main.id || null,
|
||||
word_count: main.innerText.split(/\s+/).length
|
||||
};
|
||||
}
|
||||
|
||||
var metaDesc = document.querySelector('meta[name="description"]');
|
||||
if (metaDesc) structure.meta.description = metaDesc.getAttribute('content');
|
||||
|
||||
return JSON.stringify(structure);
|
||||
})()
|
||||
"#;
|
||||
|
||||
let result = tab.evaluate(structure_js, false).expect("Failed to evaluate JS");
|
||||
let json_str = result.value.unwrap();
|
||||
let json_str = json_str.as_str().unwrap();
|
||||
|
||||
let structure: serde_json::Value = serde_json::from_str(json_str).expect("Failed to parse structure");
|
||||
|
||||
info!("Page structure: {}", serde_json::to_string_pretty(&structure).unwrap());
|
||||
|
||||
// Verify structure extraction
|
||||
assert_eq!(structure["title"].as_str(), Some("Test Page"));
|
||||
|
||||
// Check headings
|
||||
let headings = structure["headings"].as_array().unwrap();
|
||||
assert!(headings.len() >= 3, "Should have at least 3 headings (h1, h2, h2, h3)");
|
||||
assert_eq!(headings[0]["level"].as_u64(), Some(1));
|
||||
assert_eq!(headings[0]["text"].as_str(), Some("Welcome to the Test Page"));
|
||||
|
||||
// Check nav links
|
||||
let nav_links = structure["nav_links"].as_array().unwrap();
|
||||
assert_eq!(nav_links.len(), 3, "Should have 3 nav links");
|
||||
|
||||
// Check sections
|
||||
let sections = structure["sections"].as_array().unwrap();
|
||||
assert!(!sections.is_empty(), "Should have sections");
|
||||
|
||||
// Check main content
|
||||
let main_content = &structure["main_content"];
|
||||
assert!(main_content.is_object(), "Should have main_content");
|
||||
assert_eq!(main_content["tag"].as_str(), Some("main"));
|
||||
assert_eq!(main_content["id"].as_str(), Some("content"));
|
||||
assert!(main_content["word_count"].as_u64().unwrap() > 0, "Should have word count");
|
||||
|
||||
// Check meta
|
||||
assert_eq!(
|
||||
structure["meta"]["description"].as_str(),
|
||||
Some("A test page for sitemap analysis")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[ignore] // Requires Chrome to be installed
|
||||
fn test_sitemap_robots_txt_parsing() {
|
||||
env_logger::try_init().ok();
|
||||
|
||||
// This test verifies that we can parse sitemap references from robots.txt
|
||||
// We'll test with a known site that has a proper robots.txt
|
||||
let session = BrowserSession::launch(LaunchOptions::new().headless(true))
|
||||
.expect("Failed to launch browser");
|
||||
|
||||
let tool = SitemapTool::default();
|
||||
let mut context = ToolContext::new(&session);
|
||||
|
||||
// Test with a site known to have robots.txt with sitemap
|
||||
let params = SitemapParams {
|
||||
url: "https://www.google.com".to_string(),
|
||||
analyze_structure: false,
|
||||
max_pages: 1,
|
||||
};
|
||||
|
||||
let result = tool.execute_typed(params, &mut context).expect("Failed to execute sitemap tool");
|
||||
|
||||
assert!(result.success, "Tool execution should succeed");
|
||||
|
||||
let data = result.data.unwrap();
|
||||
info!("Google sitemap result: {}", serde_json::to_string_pretty(&data).unwrap());
|
||||
|
||||
// Google should have robots.txt
|
||||
// Note: This test may be flaky depending on network conditions
|
||||
assert_eq!(data["base_url"].as_str(), Some("https://www.google.com"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[ignore] // Requires Chrome to be installed
|
||||
fn test_sitemap_max_pages_limit() {
|
||||
env_logger::try_init().ok();
|
||||
|
||||
let session = BrowserSession::launch(LaunchOptions::new().headless(true))
|
||||
.expect("Failed to launch browser");
|
||||
|
||||
let tool = SitemapTool::default();
|
||||
let mut context = ToolContext::new(&session);
|
||||
|
||||
let params = SitemapParams {
|
||||
url: "https://example.com".to_string(),
|
||||
analyze_structure: true,
|
||||
max_pages: 1, // Limit to 1 page
|
||||
};
|
||||
|
||||
let result = tool.execute_typed(params, &mut context).expect("Failed to execute sitemap tool");
|
||||
|
||||
assert!(result.success);
|
||||
|
||||
let data = result.data.unwrap();
|
||||
let page_structures = data["page_structures"].as_array().unwrap();
|
||||
|
||||
// Should not exceed max_pages
|
||||
assert!(
|
||||
page_structures.len() <= 1,
|
||||
"Should not analyze more than max_pages (1)"
|
||||
);
|
||||
}
|
||||
Reference in New Issue
Block a user