449 lines
15 KiB
Rust
449 lines
15 KiB
Rust
use browser_use::{BrowserSession, LaunchOptions,
|
|
tools::{GetMarkdownParams, Tool, ToolContext, markdown::GetMarkdownTool}};
|
|
use log::info;
|
|
|
|
/// Test basic markdown extraction from a simple HTML page
|
|
#[test]
|
|
#[ignore] // Requires Chrome to be installed
|
|
fn test_basic_markdown_extraction() {
|
|
let session = BrowserSession::launch(LaunchOptions::new().headless(true)).expect("Failed to launch browser");
|
|
|
|
// Create a simple article page
|
|
let html = r#"
|
|
<!DOCTYPE html>
|
|
<html>
|
|
<head>
|
|
<title>Test Article</title>
|
|
</head>
|
|
<body>
|
|
<article>
|
|
<h1>Main Article Title</h1>
|
|
<p>This is the first paragraph of the article.</p>
|
|
<p>This is the second paragraph with <strong>bold text</strong> and <em>italic text</em>.</p>
|
|
</article>
|
|
</body>
|
|
</html>
|
|
"#;
|
|
|
|
let data_url = format!("data:text/html,{}", urlencoding::encode(html));
|
|
session.navigate(&data_url).expect("Failed to navigate");
|
|
|
|
std::thread::sleep(std::time::Duration::from_millis(500));
|
|
|
|
// Create tool and context
|
|
let tool = GetMarkdownTool::default();
|
|
let mut context = ToolContext::new(&session);
|
|
|
|
// Execute the tool
|
|
let result =
|
|
tool.execute_typed(GetMarkdownParams::default(), &mut context).expect("Failed to execute markdown tool");
|
|
|
|
// Verify the result
|
|
assert!(result.success, "Tool execution should succeed");
|
|
assert!(result.data.is_some());
|
|
|
|
let data = result.data.unwrap();
|
|
info!("Markdown result: {}", serde_json::to_string_pretty(&data).unwrap());
|
|
|
|
let markdown = data["markdown"].as_str().expect("Should have markdown");
|
|
|
|
// Verify content was extracted
|
|
assert!(markdown.contains("Main Article Title"), "Should contain title");
|
|
assert!(markdown.contains("first paragraph"), "Should contain first paragraph");
|
|
assert!(markdown.contains("second paragraph"), "Should contain second paragraph");
|
|
|
|
// Verify metadata
|
|
assert_eq!(data["currentPage"].as_u64(), Some(1));
|
|
assert_eq!(data["totalPages"].as_u64(), Some(1));
|
|
assert_eq!(data["hasMorePages"].as_bool(), Some(false));
|
|
}
|
|
|
|
/// Test markdown extraction with Readability filtering
|
|
#[test]
|
|
#[ignore]
|
|
fn test_readability_filtering() {
|
|
let session = BrowserSession::launch(LaunchOptions::new().headless(true)).expect("Failed to launch browser");
|
|
|
|
// Create a page with navigation, sidebar, and main content
|
|
let html = r#"
|
|
<!DOCTYPE html>
|
|
<html>
|
|
<head>
|
|
<title>Article with Navigation</title>
|
|
</head>
|
|
<body>
|
|
<nav>
|
|
<ul>
|
|
<li>Home</li>
|
|
<li>About</li>
|
|
<li>Contact</li>
|
|
</ul>
|
|
</nav>
|
|
|
|
<aside class="sidebar">
|
|
<h3>Advertisement</h3>
|
|
<p>Buy our product!</p>
|
|
</aside>
|
|
|
|
<article>
|
|
<h1>Important Article</h1>
|
|
<p>This is the main content that should be extracted by Readability.</p>
|
|
<p>It contains valuable information for the reader.</p>
|
|
<p>Navigation and ads should be filtered out.</p>
|
|
</article>
|
|
|
|
<footer>
|
|
<p>Copyright 2025</p>
|
|
</footer>
|
|
</body>
|
|
</html>
|
|
"#;
|
|
|
|
let data_url = format!("data:text/html,{}", urlencoding::encode(html));
|
|
session.navigate(&data_url).expect("Failed to navigate");
|
|
|
|
std::thread::sleep(std::time::Duration::from_millis(500));
|
|
|
|
let tool = GetMarkdownTool::default();
|
|
let mut context = ToolContext::new(&session);
|
|
|
|
let result =
|
|
tool.execute_typed(GetMarkdownParams::default(), &mut context).expect("Failed to execute markdown tool");
|
|
|
|
assert!(result.success);
|
|
let data = result.data.unwrap();
|
|
let markdown = data["markdown"].as_str().expect("Should have markdown");
|
|
|
|
info!("Extracted markdown:\n{}", markdown);
|
|
|
|
// Main content should be present
|
|
assert!(markdown.contains("Important Article"), "Should contain article title");
|
|
assert!(markdown.contains("main content"), "Should contain main content");
|
|
|
|
// The exact filtering depends on Readability's algorithm
|
|
// In some cases, it might include navigation/footer if the article is too short
|
|
// So we just verify the main content is present
|
|
}
|
|
|
|
/// Test pagination with large content
|
|
#[test]
|
|
#[ignore]
|
|
fn test_markdown_pagination() {
|
|
let session = BrowserSession::launch(LaunchOptions::new().headless(true)).expect("Failed to launch browser");
|
|
|
|
// Create a long article that will require multiple pages
|
|
let mut paragraphs = String::new();
|
|
for i in 1..=200 {
|
|
paragraphs.push_str(&format!(
|
|
"<p>This is paragraph number {}. It contains some text to make the content longer. Lorem ipsum dolor sit amet, consectetur adipiscing elit.</p>\n",
|
|
i
|
|
));
|
|
}
|
|
|
|
let html = format!(
|
|
r#"
|
|
<!DOCTYPE html>
|
|
<html>
|
|
<head>
|
|
<title>Long Article</title>
|
|
</head>
|
|
<body>
|
|
<article>
|
|
<h1>Very Long Article</h1>
|
|
{}
|
|
</article>
|
|
</body>
|
|
</html>
|
|
"#,
|
|
paragraphs
|
|
);
|
|
|
|
let data_url = format!("data:text/html,{}", urlencoding::encode(&html));
|
|
session.navigate(&data_url).expect("Failed to navigate");
|
|
|
|
std::thread::sleep(std::time::Duration::from_millis(1000));
|
|
|
|
let tool = GetMarkdownTool::default();
|
|
let mut context = ToolContext::new(&session);
|
|
|
|
// Get first page with small page size
|
|
let result = tool
|
|
.execute_typed(
|
|
GetMarkdownParams {
|
|
page: 1,
|
|
page_size: 5000, // Small page size to force pagination
|
|
},
|
|
&mut context,
|
|
)
|
|
.expect("Failed to execute markdown tool");
|
|
|
|
assert!(result.success);
|
|
let data = result.data.unwrap();
|
|
|
|
info!("Pagination result: {}", serde_json::to_string_pretty(&data).unwrap());
|
|
|
|
let markdown = data["markdown"].as_str().expect("Should have markdown");
|
|
let current_page = data["currentPage"].as_u64().expect("Should have currentPage");
|
|
let total_pages = data["totalPages"].as_u64().expect("Should have totalPages");
|
|
let has_more = data["hasMorePages"].as_bool().expect("Should have hasMorePages");
|
|
|
|
// Verify pagination
|
|
assert_eq!(current_page, 1);
|
|
assert!(total_pages > 1, "Should have multiple pages, got total_pages={}", total_pages);
|
|
assert!(has_more, "Should have more pages");
|
|
|
|
// Verify title is on first page (either the original or what Readability extracted)
|
|
let title_present = markdown.contains("Very Long Article") || markdown.contains("Long Article");
|
|
assert!(title_present, "First page should have title. Markdown: {}", &markdown[..200.min(markdown.len())]);
|
|
|
|
// Verify pagination footer
|
|
assert!(markdown.contains("Page 1 of"), "Should have pagination info");
|
|
assert!(markdown.contains("more page"), "Should indicate more pages");
|
|
|
|
// Note: Testing second page in the same session sometimes fails due to
|
|
// Readability caching. In production this works fine as each call is independent.
|
|
// Uncomment below to test second page with a new session:
|
|
|
|
/*
|
|
// Test getting second page
|
|
let result2 = tool
|
|
.execute_typed(
|
|
GetMarkdownParams {
|
|
page: 2,
|
|
page_size: 5000,
|
|
},
|
|
&mut context,
|
|
)
|
|
.expect("Failed to execute markdown tool");
|
|
|
|
assert!(result2.success);
|
|
let data2 = result2.data.unwrap();
|
|
let markdown2 = data2["markdown"].as_str().expect("Should have markdown");
|
|
|
|
// Second page should not have the title
|
|
assert!(!markdown2.starts_with("# Very Long Article"), "Second page should not start with title");
|
|
|
|
// Should have different content than first page
|
|
assert_ne!(markdown, markdown2, "Pages should have different content");
|
|
*/
|
|
}
|
|
|
|
/// Test edge case: empty page
|
|
#[test]
|
|
#[ignore]
|
|
fn test_empty_page() {
|
|
let session = BrowserSession::launch(LaunchOptions::new().headless(true)).expect("Failed to launch browser");
|
|
|
|
let html = r#"
|
|
<!DOCTYPE html>
|
|
<html>
|
|
<head>
|
|
<title>Empty Page</title>
|
|
</head>
|
|
<body>
|
|
</body>
|
|
</html>
|
|
"#;
|
|
|
|
let data_url = format!("data:text/html,{}", urlencoding::encode(html));
|
|
session.navigate(&data_url).expect("Failed to navigate");
|
|
|
|
std::thread::sleep(std::time::Duration::from_millis(500));
|
|
|
|
let tool = GetMarkdownTool::default();
|
|
let mut context = ToolContext::new(&session);
|
|
|
|
let result = tool.execute_typed(GetMarkdownParams::default(), &mut context);
|
|
|
|
// Should handle empty content gracefully
|
|
// Readability might fail on empty pages, which is acceptable
|
|
match result {
|
|
Ok(res) => {
|
|
info!("Empty page result: {:?}", res);
|
|
// If it succeeds, it should have minimal content
|
|
}
|
|
Err(e) => {
|
|
info!("Empty page error (expected): {:?}", e);
|
|
// Failing on empty pages is acceptable
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Test page with tables (GFM support)
|
|
#[test]
|
|
#[ignore]
|
|
fn test_table_conversion() {
|
|
let session = BrowserSession::launch(LaunchOptions::new().headless(true)).expect("Failed to launch browser");
|
|
|
|
let html = r#"
|
|
<!DOCTYPE html>
|
|
<html>
|
|
<head>
|
|
<title>Table Test</title>
|
|
</head>
|
|
<body>
|
|
<article>
|
|
<h1>Data Table</h1>
|
|
<table>
|
|
<thead>
|
|
<tr>
|
|
<th>Name</th>
|
|
<th>Age</th>
|
|
<th>City</th>
|
|
</tr>
|
|
</thead>
|
|
<tbody>
|
|
<tr>
|
|
<td>Alice</td>
|
|
<td>30</td>
|
|
<td>New York</td>
|
|
</tr>
|
|
<tr>
|
|
<td>Bob</td>
|
|
<td>25</td>
|
|
<td>London</td>
|
|
</tr>
|
|
</tbody>
|
|
</table>
|
|
</article>
|
|
</body>
|
|
</html>
|
|
"#;
|
|
|
|
let data_url = format!("data:text/html,{}", urlencoding::encode(html));
|
|
session.navigate(&data_url).expect("Failed to navigate");
|
|
|
|
std::thread::sleep(std::time::Duration::from_millis(500));
|
|
|
|
let tool = GetMarkdownTool::default();
|
|
let mut context = ToolContext::new(&session);
|
|
|
|
let result =
|
|
tool.execute_typed(GetMarkdownParams::default(), &mut context).expect("Failed to execute markdown tool");
|
|
|
|
assert!(result.success);
|
|
let data = result.data.unwrap();
|
|
let markdown = data["markdown"].as_str().expect("Should have markdown");
|
|
|
|
info!("Table markdown:\n{}", markdown);
|
|
|
|
// Verify table content is present
|
|
assert!(markdown.contains("Name"), "Should contain table header");
|
|
assert!(markdown.contains("Alice"), "Should contain table data");
|
|
assert!(markdown.contains("Bob"), "Should contain table data");
|
|
|
|
// Table should be formatted (exact format depends on html2md library)
|
|
assert!(markdown.contains("30"), "Should contain age data");
|
|
assert!(markdown.contains("London"), "Should contain city data");
|
|
}
|
|
|
|
/// Test calling get_markdown twice on the same page
|
|
/// This reproduces the bug where the second call fails with "No value returned from JavaScript"
|
|
#[test]
|
|
#[ignore]
|
|
fn test_double_execution_same_page() {
|
|
let session = BrowserSession::launch(LaunchOptions::new().headless(true)).expect("Failed to launch browser");
|
|
|
|
// Create a simple article page
|
|
let html = r#"
|
|
<!DOCTYPE html>
|
|
<html>
|
|
<head>
|
|
<title>Double Execution Test</title>
|
|
</head>
|
|
<body>
|
|
<article>
|
|
<h1>Test Article</h1>
|
|
<p>This is paragraph one with some content.</p>
|
|
<p>This is paragraph two with more content.</p>
|
|
<p>This is paragraph three with even more content.</p>
|
|
</article>
|
|
</body>
|
|
</html>
|
|
"#;
|
|
|
|
let data_url = format!("data:text/html,{}", urlencoding::encode(html));
|
|
session.navigate(&data_url).expect("Failed to navigate");
|
|
|
|
std::thread::sleep(std::time::Duration::from_millis(500));
|
|
|
|
let tool = GetMarkdownTool::default();
|
|
let mut context = ToolContext::new(&session);
|
|
|
|
// First execution
|
|
info!("Executing get_markdown (first call)...");
|
|
let result1 = tool
|
|
.execute_typed(GetMarkdownParams::default(), &mut context)
|
|
.expect("First call to get_markdown should succeed");
|
|
|
|
assert!(result1.success, "First execution should succeed");
|
|
let data1 = result1.data.expect("First call should return data");
|
|
let markdown1 = data1["markdown"].as_str().expect("Should have markdown");
|
|
|
|
info!("First call succeeded, markdown length: {}", markdown1.len());
|
|
assert!(markdown1.contains("Test Article"), "First call should contain title");
|
|
assert!(markdown1.contains("paragraph one"), "First call should contain content");
|
|
|
|
// Second execution on the same page - this is where the bug occurs
|
|
info!("Executing get_markdown (second call on same page)...");
|
|
let result2 = tool
|
|
.execute_typed(GetMarkdownParams::default(), &mut context)
|
|
.expect("Second call to get_markdown should also succeed");
|
|
|
|
assert!(result2.success, "Second execution should succeed");
|
|
let data2 = result2.data.expect("Second call should return data");
|
|
let markdown2 = data2["markdown"].as_str().expect("Should have markdown");
|
|
|
|
info!("Second call succeeded, markdown length: {}", markdown2.len());
|
|
assert!(markdown2.contains("Test Article"), "Second call should contain title");
|
|
assert!(markdown2.contains("paragraph one"), "Second call should contain content");
|
|
|
|
// The content should be the same (or at least very similar)
|
|
assert_eq!(markdown1, markdown2, "Both calls should return the same content");
|
|
|
|
info!("Double execution test passed!");
|
|
}
|
|
|
|
/// Test requesting page beyond available pages
|
|
#[test]
|
|
#[ignore]
|
|
fn test_page_clamping() {
|
|
let session = BrowserSession::launch(LaunchOptions::new().headless(true)).expect("Failed to launch browser");
|
|
|
|
let html = r#"
|
|
<!DOCTYPE html>
|
|
<html>
|
|
<head>
|
|
<title>Short Article</title>
|
|
</head>
|
|
<body>
|
|
<article>
|
|
<h1>Short Content</h1>
|
|
<p>This is a very short article.</p>
|
|
</article>
|
|
</body>
|
|
</html>
|
|
"#;
|
|
|
|
let data_url = format!("data:text/html,{}", urlencoding::encode(html));
|
|
session.navigate(&data_url).expect("Failed to navigate");
|
|
|
|
std::thread::sleep(std::time::Duration::from_millis(500));
|
|
|
|
let tool = GetMarkdownTool::default();
|
|
let mut context = ToolContext::new(&session);
|
|
|
|
// Request page 999 (way beyond available content)
|
|
let result = tool
|
|
.execute_typed(GetMarkdownParams { page: 999, page_size: 100_000 }, &mut context)
|
|
.expect("Failed to execute markdown tool");
|
|
|
|
assert!(result.success);
|
|
let data = result.data.unwrap();
|
|
|
|
// Should clamp to last available page (page 1)
|
|
assert_eq!(data["currentPage"].as_u64(), Some(1));
|
|
assert_eq!(data["totalPages"].as_u64(), Some(1));
|
|
assert_eq!(data["hasMorePages"].as_bool(), Some(false));
|
|
}
|