// openclaw-backups/skills/fast-browser-use/tests/sitemap_integration.rs

//! Integration tests for the sitemap analysis tool

use browser_use::{
    BrowserSession, LaunchOptions,
    tools::{sitemap::{SitemapTool, SitemapParams}, Tool, ToolContext},
};
use log::info;

/// Smoke test: runs `SitemapTool` against `https://example.com` with
/// structure analysis disabled and checks the shape of the returned JSON.
///
/// Ignored by default because it launches a real browser.
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_tool_basic() {
    env_logger::try_init().ok();

    let session = BrowserSession::launch(LaunchOptions::new().headless(true))
        .expect("Failed to launch browser");

    let tool = SitemapTool::default();
    let mut context = ToolContext::new(&session);

    // Only the shape of the result is asserted below; the test does not
    // require that any sitemap or page is actually discovered.
    let params = SitemapParams {
        url: "https://example.com".to_string(),
        analyze_structure: false,
        max_pages: 5,
    };

    let result = tool
        .execute_typed(params, &mut context)
        .expect("Failed to execute sitemap tool");

    assert!(result.success, "Tool execution should succeed");

    // `expect` both verifies the payload is present and unwraps it.
    let data = result.data.expect("Should have result data");
    info!("Sitemap result: {}", serde_json::to_string_pretty(&data).unwrap());

    // Verify basic structure
    assert!(data["base_url"].as_str().is_some(), "base_url should be a string");
    assert!(data["sitemaps"].is_array(), "sitemaps should be an array");
    assert!(data["pages"].is_array(), "pages should be an array");
}

/// Runs `SitemapTool` with `analyze_structure: true` and checks that the
/// result carries a `page_structures` array whose first entry (if any) has
/// the expected fields.
///
/// Ignored by default because it launches a real browser.
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_tool_with_structure_analysis() {
    env_logger::try_init().ok();

    let session = BrowserSession::launch(LaunchOptions::new().headless(true))
        .expect("Failed to launch browser");

    let tool = SitemapTool::default();
    let mut context = ToolContext::new(&session);

    let params = SitemapParams {
        url: "https://example.com".to_string(),
        analyze_structure: true,
        max_pages: 2,
    };

    let result = tool
        .execute_typed(params, &mut context)
        .expect("Failed to execute sitemap tool");

    assert!(result.success, "Tool execution should succeed");

    let data = result.data.expect("Should have result data");
    info!("Sitemap with structure: {}", serde_json::to_string_pretty(&data).unwrap());

    // Verify structure analysis was performed
    let page_structures = data["page_structures"]
        .as_array()
        .expect("Should have page_structures array");

    // An empty array is accepted; when at least one page was analyzed,
    // check the format of the first entry.
    if let Some(first) = page_structures.first() {
        assert!(first["url"].as_str().is_some(), "Page structure should have url");
        assert!(first["title"].as_str().is_some(), "Page structure should have title");
        assert!(first["headings"].is_array(), "Page structure should have headings array");
        assert!(first["nav_links"].is_array(), "Page structure should have nav_links array");
        assert!(first["sections"].is_array(), "Page structure should have sections array");
    }
}

/// Exercises the free function `analyze_sitemap` (rather than the tool
/// wrapper) with structure analysis enabled and a page limit of 2.
///
/// Ignored by default because it launches a real browser.
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_analyze_function() {
    env_logger::try_init().ok();

    let launch_options = LaunchOptions::new().headless(true);
    let session = BrowserSession::launch(launch_options).expect("Failed to launch browser");

    // Call the standalone entry point directly.
    let target = "https://example.com";
    let analysis = browser_use::tools::sitemap::analyze_sitemap(&session, target, true, 2)
        .expect("Failed to analyze sitemap");

    info!("Analyze sitemap result: {:?}", analysis);

    assert_eq!(analysis.base_url, target);

    // Only inspect the first analyzed page when there is one.
    if let Some(homepage) = analysis.page_structures.first() {
        assert!(!homepage.url.is_empty(), "Should have URL");
        assert!(!homepage.title.is_empty(), "Should have title");
    }
}

/// Checks page-structure extraction against a fixture with known content.
///
/// The test loads an inline HTML document through a `data:` URL, evaluates a
/// JavaScript snippet in the tab that collects the title, headings, nav
/// links, sectioning elements, main-content info and meta description into a
/// JSON string, and then asserts on the parsed JSON. It does not call
/// `SitemapTool`; the script is evaluated directly via `tab.evaluate`.
///
/// Ignored by default because it launches a real browser.
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_page_structure_extraction() {
    env_logger::try_init().ok();

    let session = BrowserSession::launch(LaunchOptions::new().headless(true))
        .expect("Failed to launch browser");

    // Create a test page with known structure using data: URL.
    // It contains 4 headings (h1, h2, h2, h3) and 3 nav links.
    let test_html = r#"
        <!DOCTYPE html>
        <html>
        <head>
            <title>Test Page</title>
            <meta name="description" content="A test page for sitemap analysis">
            <meta name="keywords" content="test, sitemap, analysis">
            <link rel="canonical" href="https://example.com/test">
        </head>
        <body>
            <header>
                <nav>
                    <a href="/home">Home</a>
                    <a href="/about">About</a>
                    <a href="/contact">Contact</a>
                </nav>
            </header>
            <main id="content" role="main">
                <h1>Welcome to the Test Page</h1>
                <section>
                    <h2>Section One</h2>
                    <p>Some content here with multiple words to count.</p>
                </section>
                <section>
                    <h2>Section Two</h2>
                    <p>More content in this section.</p>
                </section>
            </main>
            <aside>
                <h3>Sidebar</h3>
            </aside>
            <footer>
                <p>Footer content</p>
            </footer>
        </body>
        </html>
    "#;

    // Navigate to the test page (the HTML is percent-encoded into the URL)
    let data_url = format!("data:text/html,{}", urlencoding::encode(test_html));
    session.navigate(&data_url).expect("Failed to navigate");
    session.wait_for_navigation().expect("Failed to wait for navigation");

    // Extra pause after navigation before querying the DOM — presumably to
    // let the page settle; confirm whether it is still needed.
    std::thread::sleep(std::time::Duration::from_millis(500));

    // Extract structure using JavaScript evaluation directly
    let tab = session.tab().expect("Failed to get tab");
    let structure_js = r#"
        (function() {
            var structure = {
                url: window.location.href,
                title: document.title,
                headings: [],
                nav_links: [],
                sections: [],
                main_content: null,
                meta: {}
            };

            var headings = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
            for (var i = 0; i < headings.length && i < 50; i++) {
                structure.headings.push({
                    level: parseInt(headings[i].tagName.charAt(1)),
                    text: headings[i].innerText.trim().substring(0, 200)
                });
            }

            var navs = document.querySelectorAll('nav a, header a, [role="navigation"] a');
            var seenLinks = new Set();
            for (var i = 0; i < navs.length && structure.nav_links.length < 30; i++) {
                var href = navs[i].getAttribute('href');
                var text = navs[i].innerText.trim();
                if (href && text && !seenLinks.has(href)) {
                    seenLinks.add(href);
                    structure.nav_links.push({ text: text.substring(0, 100), href: href });
                }
            }

            var sections = document.querySelectorAll('main, article, section, aside, footer');
            for (var i = 0; i < sections.length && i < 20; i++) {
                var el = sections[i];
                structure.sections.push({
                    tag: el.tagName.toLowerCase(),
                    id: el.id || null,
                    class: el.className ? el.className.substring(0, 100) : null,
                    role: el.getAttribute('role') || null
                });
            }

            var main = document.querySelector('main, [role="main"], #main, #content, .main-content');
            if (main) {
                structure.main_content = {
                    tag: main.tagName.toLowerCase(),
                    id: main.id || null,
                    word_count: main.innerText.split(/\s+/).length
                };
            }

            var metaDesc = document.querySelector('meta[name="description"]');
            if (metaDesc) structure.meta.description = metaDesc.getAttribute('content');

            return JSON.stringify(structure);
        })()
    "#;

    // The script returns a JSON *string*, so the evaluation result is first
    // read as a string and then parsed into a JSON value.
    let result = tab.evaluate(structure_js, false).expect("Failed to evaluate JS");
    let json_value = result.value.expect("JS evaluation should return a value");
    let json_str = json_value
        .as_str()
        .expect("JS evaluation result should be a JSON string");

    let structure: serde_json::Value =
        serde_json::from_str(json_str).expect("Failed to parse structure");

    info!("Page structure: {}", serde_json::to_string_pretty(&structure).unwrap());

    // Verify structure extraction
    assert_eq!(structure["title"].as_str(), Some("Test Page"));

    // Check headings: the fixture has exactly four, in document order.
    let headings = structure["headings"]
        .as_array()
        .expect("headings should be an array");
    assert_eq!(headings.len(), 4, "Should have exactly 4 headings (h1, h2, h2, h3)");
    assert_eq!(headings[0]["level"].as_u64(), Some(1));
    assert_eq!(headings[0]["text"].as_str(), Some("Welcome to the Test Page"));

    // Check nav links
    let nav_links = structure["nav_links"]
        .as_array()
        .expect("nav_links should be an array");
    assert_eq!(nav_links.len(), 3, "Should have 3 nav links");

    // Check sections
    let sections = structure["sections"]
        .as_array()
        .expect("sections should be an array");
    assert!(!sections.is_empty(), "Should have sections");

    // Check main content
    let main_content = &structure["main_content"];
    assert!(main_content.is_object(), "Should have main_content");
    assert_eq!(main_content["tag"].as_str(), Some("main"));
    assert_eq!(main_content["id"].as_str(), Some("content"));
    let word_count = main_content["word_count"]
        .as_u64()
        .expect("word_count should be a non-negative integer");
    assert!(word_count > 0, "Should have word count");

    // Check meta
    assert_eq!(
        structure["meta"]["description"].as_str(),
        Some("A test page for sitemap analysis")
    );
}

/// Runs the sitemap tool against `https://www.google.com`, a site chosen
/// because it is expected to publish a robots.txt that references sitemaps.
///
/// Note that the only field asserted on is `base_url`; the discovered
/// sitemaps themselves are logged but not checked. The outcome may vary with
/// network conditions.
///
/// Ignored by default because it launches a real browser.
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_robots_txt_parsing() {
    env_logger::try_init().ok();

    let launch_options = LaunchOptions::new().headless(true);
    let session = BrowserSession::launch(launch_options).expect("Failed to launch browser");

    let tool = SitemapTool::default();
    let mut context = ToolContext::new(&session);

    // Structure analysis is off and the page limit is 1.
    let outcome = tool
        .execute_typed(
            SitemapParams {
                url: String::from("https://www.google.com"),
                analyze_structure: false,
                max_pages: 1,
            },
            &mut context,
        )
        .expect("Failed to execute sitemap tool");

    assert!(outcome.success, "Tool execution should succeed");

    let payload = outcome.data.unwrap();
    info!("Google sitemap result: {}", serde_json::to_string_pretty(&payload).unwrap());

    // The reported base URL must echo the requested one.
    let reported_base = payload["base_url"].as_str();
    assert_eq!(reported_base, Some("https://www.google.com"));
}

/// Verifies that with `max_pages: 1` and structure analysis enabled, the
/// tool reports at most one entry in `page_structures`.
///
/// Ignored by default because it launches a real browser.
#[test]
#[ignore] // Requires Chrome to be installed
fn test_sitemap_max_pages_limit() {
    env_logger::try_init().ok();

    let launch_options = LaunchOptions::new().headless(true);
    let session = BrowserSession::launch(launch_options).expect("Failed to launch browser");

    let tool = SitemapTool::default();
    let mut context = ToolContext::new(&session);

    let outcome = tool
        .execute_typed(
            SitemapParams {
                url: String::from("https://example.com"),
                analyze_structure: true,
                max_pages: 1, // Limit to 1 page
            },
            &mut context,
        )
        .expect("Failed to execute sitemap tool");

    assert!(outcome.success);

    let payload = outcome.data.unwrap();
    let analyzed = payload["page_structures"].as_array().unwrap().len();

    // The number of analyzed pages must stay within the requested limit.
    assert!(
        analyzed <= 1,
        "Should not analyze more than max_pages (1)"
    );
}