Complete API documentation for Reader.

`ReaderClient` is the recommended way to use Reader: it manages the HeroCore lifecycle automatically, reuses connections efficiently, and auto-closes on process exit.
```ts
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient({ verbose: true });

// Scrape URLs
const result = await reader.scrape({
  urls: ["https://example.com"],
  formats: ["markdown", "text"],
});

// Crawl a website
const crawlResult = await reader.crawl({
  url: "https://example.com",
  depth: 2,
});

// Close when done (optional - auto-closes on exit)
await reader.close();
```

```ts
new ReaderClient(options?: ReaderClientOptions)
```

| Option | Type | Default | Description |
|---|---|---|---|
| `verbose` | `boolean` | `false` | Enable verbose logging |
| `showChrome` | `boolean` | `false` | Show browser window for debugging |
| `browserPool` | `BrowserPoolConfig` | - | Browser pool configuration |
| `proxies` | `ProxyConfig[]` | - | List of proxies to rotate through |
| `proxyRotation` | `"round-robin" \| "random"` | `"round-robin"` | Proxy rotation strategy |
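For example, a client that rotates randomly across two proxies (hosts, ports, and credentials below are placeholders):

```ts
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient({
  proxies: [
    { host: "proxy1.example.com", port: 8080, username: "user", password: "pass" },
    { host: "proxy2.example.com", port: 8080, username: "user", password: "pass" },
  ],
  proxyRotation: "random", // pick a proxy at random per request instead of round-robin
});
```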
The `browserPool` option accepts a `BrowserPoolConfig`:

| Option | Type | Default | Description |
|---|---|---|---|
| `size` | `number` | `2` | Number of browser instances |
| `retireAfterPages` | `number` | `100` | Retire browser after N page loads |
| `retireAfterMinutes` | `number` | `30` | Retire browser after N minutes |
| `maxQueueSize` | `number` | `100` | Maximum pending requests in queue |
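A minimal sketch of a client with a larger pool (the numbers are illustrative, not recommendations):

```ts
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient({
  browserPool: {
    size: 4,              // four concurrent browser instances
    retireAfterPages: 50, // recycle each browser after 50 page loads
  },
});
```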
Pre-initialize HeroCore. Called automatically on the first scrape or crawl.

```ts
await reader.start(): Promise<void>
```

Scrape one or more URLs.

```ts
const result = await reader.scrape(options): Promise<ScrapeResult>
```

See `ScrapeOptions` for available options.

Crawl a website to discover pages.

```ts
const result = await reader.crawl(options): Promise<CrawlResult>
```

See `CrawlOptions` for available options.

Check if the client is initialized and ready.

```ts
reader.isReady(): boolean
```

Close the client and release resources.

```ts
await reader.close(): Promise<void>
```

For advanced use cases where you need custom HeroCore management, you can use the direct functions. Note that without `connectionToCore`, each call spawns a new HeroCore instance, which is less efficient.
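If you do share a core, the wiring might look like this — a sketch assuming the `TransportBridge` pattern from `@ulixee/hero-core` and `@ulixee/net`; only `connectionToCore` itself is part of Reader's API, so verify the rest against those packages' docs for your version:

```ts
import HeroCore from "@ulixee/hero-core";
import { TransportBridge } from "@ulixee/net";
import { ConnectionToHeroCore } from "@ulixee/hero";
import { scrape, crawl } from "@vakra-dev/reader";

// One in-process HeroCore shared by every direct call.
const bridge = new TransportBridge();
const core = new HeroCore();
core.addConnection(bridge.transportToCore);
const connectionToCore = new ConnectionToHeroCore(bridge.transportToClient);

// Both calls reuse the same core instead of each spawning their own.
const scraped = await scrape({ urls: ["https://example.com"], connectionToCore });
const crawled = await crawl({ url: "https://example.com", connectionToCore });

// Shut down the shared core (method name per @ulixee/hero-core; check your version).
await core.close();
```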
Scrape one or more URLs and return content in the specified formats.

```ts
import { scrape } from "@vakra-dev/reader";

const result = await scrape({
  urls: ["https://example.com"],
  formats: ["markdown", "text"],
});
```

| Name | Type | Required | Default | Description |
|---|---|---|---|---|
| `urls` | `string[]` | Yes | - | Array of URLs to scrape |
| `formats` | `FormatType[]` | No | `["markdown"]` | Output formats |
| `includeMetadata` | `boolean` | No | `true` | Include metadata in formatted output |
| `userAgent` | `string` | No | - | Custom user agent string |
| `timeoutMs` | `number` | No | `30000` | Request timeout in milliseconds |
| `includePatterns` | `string[]` | No | `[]` | URL patterns to include (regex) |
| `excludePatterns` | `string[]` | No | `[]` | URL patterns to exclude (regex) |
| `batchConcurrency` | `number` | No | `1` | URLs to process in parallel |
| `batchTimeoutMs` | `number` | No | `300000` | Total batch timeout in milliseconds |
| `maxRetries` | `number` | No | `2` | Retry attempts for failed URLs |
| `onProgress` | `ProgressCallback` | No | - | Progress callback function |
| `proxy` | `ProxyConfig` | No | - | Proxy configuration |
| `waitForSelector` | `string` | No | - | CSS selector to wait for |
| `verbose` | `boolean` | No | `false` | Enable verbose logging |
| `showChrome` | `boolean` | No | `false` | Show browser window |
| `connectionToCore` | `any` | No | - | Shared Hero Core connection |
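For instance, a run that waits for content to render and retries failures — a sketch where the URL, selector, and numbers are illustrative:

```ts
import { scrape } from "@vakra-dev/reader";

const result = await scrape({
  urls: ["https://example.com/blog"],
  waitForSelector: "article", // wait for the main content to render
  maxRetries: 3,              // retry failed URLs up to 3 times
  timeoutMs: 60000,           // allow slow pages a full minute
});
```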
Returns `Promise<ScrapeResult>`:

```ts
interface ScrapeResult {
  data: WebsiteScrapeResult[];
  batchMetadata: BatchMetadata;
}
```

```ts
// Using ReaderClient (recommended)
const reader = new ReaderClient();

const result = await reader.scrape({
  urls: ["https://example.com", "https://example.org"],
  formats: ["markdown", "json"],
  batchConcurrency: 2,
  onProgress: ({ completed, total, currentUrl }) => {
    console.log(`[${completed}/${total}] ${currentUrl}`);
  },
});

for (const site of result.data) {
  console.log("URL:", site.metadata.baseUrl);
  console.log("Markdown:", site.markdown?.substring(0, 200));
}

await reader.close();
```

Crawl a website to discover pages, optionally scraping their content.
```ts
// Using ReaderClient (recommended)
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient();

const result = await reader.crawl({
  url: "https://example.com",
  depth: 2,
  maxPages: 20,
  scrape: true,
});

await reader.close();
```

| Name | Type | Required | Default | Description |
|---|---|---|---|---|
| `url` | `string` | Yes | - | Seed URL to start crawling |
| `depth` | `number` | No | `1` | Maximum crawl depth |
| `maxPages` | `number` | No | `20` | Maximum pages to discover |
| `scrape` | `boolean` | No | `false` | Also scrape discovered pages |
| `delayMs` | `number` | No | `1000` | Delay between requests in milliseconds |
| `timeoutMs` | `number` | No | - | Total crawl timeout |
| `includePatterns` | `string[]` | No | - | URL patterns to include |
| `excludePatterns` | `string[]` | No | - | URL patterns to exclude |
| `formats` | `FormatType[]` | No | `["markdown", "html"]` | Output formats when scraping |
| `scrapeConcurrency` | `number` | No | `2` | Scraping parallelism |
| `proxy` | `ProxyConfig` | No | - | Proxy configuration |
| `userAgent` | `string` | No | - | Custom user agent |
| `verbose` | `boolean` | No | `false` | Enable verbose logging |
| `showChrome` | `boolean` | No | `false` | Show browser window |
| `connectionToCore` | `any` | No | - | Shared Hero Core connection |
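For example, a politeness-focused crawl that slows the request rate and caps total runtime (the values are illustrative):

```ts
import { ReaderClient } from "@vakra-dev/reader";

const reader = new ReaderClient();

const result = await reader.crawl({
  url: "https://example.com",
  maxPages: 100,
  delayMs: 2000,     // wait 2s between requests
  timeoutMs: 120000, // abort the whole crawl after 2 minutes
});

await reader.close();
```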
Returns `Promise<CrawlResult>`:

```ts
interface CrawlResult {
  urls: CrawlUrl[];
  scraped?: ScrapeResult;
  metadata: CrawlMetadata;
}
```

```ts
const reader = new ReaderClient();

const result = await reader.crawl({
  url: "https://docs.example.com",
  depth: 3,
  maxPages: 50,
  includePatterns: ["docs/*"],
  excludePatterns: ["docs/archive/*"],
  scrape: true,
});

console.log(`Discovered ${result.urls.length} pages`);
result.urls.forEach((page) => {
  console.log(`- ${page.title}: ${page.url}`);
});

if (result.scraped) {
  console.log(`Scraped ${result.scraped.batchMetadata.successfulUrls} pages`);
}

await reader.close();
```

```ts
interface ScrapeOptions {
  urls: string[];
  formats?: Array<"markdown" | "html" | "json" | "text">;
  includeMetadata?: boolean;
  userAgent?: string;
  timeoutMs?: number;
  includePatterns?: string[];
  excludePatterns?: string[];
  batchConcurrency?: number;
  batchTimeoutMs?: number;
  maxRetries?: number;
  onProgress?: (progress: ProgressInfo) => void;
  proxy?: ProxyConfig;
  waitForSelector?: string;
  verbose?: boolean;
  showChrome?: boolean;
  connectionToCore?: any;
}
```

```ts
interface CrawlOptions {
  url: string;
  depth?: number;
  maxPages?: number;
  scrape?: boolean;
  delayMs?: number;
  timeoutMs?: number;
  includePatterns?: string[];
  excludePatterns?: string[];
  formats?: Array<"markdown" | "html" | "json" | "text">;
  scrapeConcurrency?: number;
  proxy?: ProxyConfig;
  userAgent?: string;
  verbose?: boolean;
  showChrome?: boolean;
  connectionToCore?: any;
}
```

```ts
interface ProxyConfig {
  url?: string;
  type?: "datacenter" | "residential";
  host?: string;
  port?: number;
  username?: string;
  password?: string;
  country?: string;
}
```

```ts
interface ScrapeResult {
  data: WebsiteScrapeResult[];
  batchMetadata: BatchMetadata;
}
```

```ts
interface WebsiteScrapeResult {
  markdown?: string;
  html?: string;
  json?: string;
  text?: string;
  metadata: {
    baseUrl: string;
    totalPages: number;
    scrapedAt: string;
    duration: number;
    website: WebsiteMetadata;
    proxy?: ProxyMetadata; // Included when proxy pooling is used
  };
}
```

```ts
interface ProxyMetadata {
  host: string;
  port: number;
  country?: string; // If geo-targeting was used
}
```

```ts
interface BatchMetadata {
  totalUrls: number;
  successfulUrls: number;
  failedUrls: number;
  scrapedAt: string;
  totalDuration: number;
  errors?: Array<{ url: string; error: string }>;
}
```

```ts
interface CrawlResult {
  urls: CrawlUrl[];
  scraped?: ScrapeResult;
  metadata: CrawlMetadata;
}
```

```ts
interface CrawlUrl {
  url: string;
  title: string;
  description: string | null;
}
```

```ts
interface CrawlMetadata {
  totalUrls: number;
  maxDepth: number;
  totalDuration: number;
  seedUrl: string;
}
```

```ts
interface WebsiteMetadata {
  title: string | null;
  description: string | null;
  author: string | null;
  language: string | null;
  charset: string | null;
  favicon: string | null;
  image: string | null;
  canonical: string | null;
  keywords: string[] | null;
  robots: string | null;
  themeColor: string | null;
  openGraph: {
    title: string | null;
    description: string | null;
    type: string | null;
    url: string | null;
    image: string | null;
    siteName: string | null;
    locale: string | null;
  } | null;
  twitter: {
    card: string | null;
    site: string | null;
    creator: string | null;
    title: string | null;
    description: string | null;
    image: string | null;
  } | null;
}
```

```ts
interface ProgressInfo {
  completed: number;
  total: number;
  currentUrl: string;
}
```

Manages a pool of Hero browser instances for efficient scraping.
```ts
import { BrowserPool } from "@vakra-dev/reader";

const pool = new BrowserPool({ size: 5 });
await pool.initialize();

const result = await pool.withBrowser(async (hero) => {
  await hero.goto("https://example.com");
  return await hero.document.title;
});

await pool.shutdown();
```

```ts
new BrowserPool(config?: PoolConfig)
```

| Parameter | Type | Default | Description |
|---|---|---|---|
| `size` | `number` | `2` | Number of browser instances |
| `retireAfterPages` | `number` | `100` | Recycle after N pages |
| `retireAfterMinutes` | `number` | `30` | Recycle after N minutes |
| `maxQueueSize` | `number` | `100` | Maximum pending requests |
| `healthCheckIntervalMs` | `number` | `300000` | Health check interval |
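As a sketch, a pool tuned to recycle instances more aggressively than the defaults (the numbers are illustrative, not recommendations):

```ts
import { BrowserPool } from "@vakra-dev/reader";

// Recycle each browser after 25 pages or 10 minutes, whichever comes first.
const pool = new BrowserPool({
  size: 4,
  retireAfterPages: 25,
  retireAfterMinutes: 10,
  maxQueueSize: 50,
});
await pool.initialize();
```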
Initialize the browser pool.

```ts
await pool.initialize(): Promise<void>
```

Execute a function with an acquired browser, automatically releasing it afterwards.

```ts
await pool.withBrowser<T>(fn: (hero: Hero) => Promise<T>): Promise<T>
```

Manually acquire a browser instance. Must be paired with `release()`.

```ts
const hero = await pool.acquire(): Promise<Hero>
```

Release a browser instance back to the pool.

```ts
await pool.release(hero: Hero): Promise<void>
```

Check the health of all pool instances.

```ts
const health = await pool.healthCheck(): Promise<HealthCheckResult>
```

Get current pool statistics.

```ts
const stats = pool.getStats(): PoolStats
```

Shut down all browser instances.

```ts
await pool.shutdown(): Promise<void>
```
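When acquiring manually, release in a `finally` block so a thrown error doesn't leak the instance — a short sketch using the methods above:

```ts
const hero = await pool.acquire();
try {
  await hero.goto("https://example.com");
  // ... work with the page ...
} finally {
  // Always return the browser to the pool, even if the work above throws.
  await pool.release(hero);
}
```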
Detect if a Cloudflare challenge is present on the current page.

```ts
import { detectChallenge } from "@vakra-dev/reader";

const detection = await detectChallenge(hero);

if (detection.isChallenge) {
  console.log("Challenge type:", detection.type);
  console.log("Signals:", detection.signals);
}
```

```ts
interface ChallengeDetection {
  isChallenge: boolean;
  type: "js_challenge" | "turnstile" | "captcha" | "blocked" | null;
  signals: Array<{
    type: "dom" | "text" | "url";
    value: string;
  }>;
}
```

Wait for a Cloudflare challenge to be resolved.
```ts
import { waitForChallengeResolution } from "@vakra-dev/reader";

const result = await waitForChallengeResolution(hero, {
  maxWaitMs: 45000,
  pollIntervalMs: 500,
  verbose: true,
  initialUrl: await hero.url,
});

if (result.resolved) {
  console.log(`Resolved via ${result.method} in ${result.waitedMs}ms`);
}
```

| Option | Type | Default | Description |
|---|---|---|---|
| `maxWaitMs` | `number` | `45000` | Maximum wait time in milliseconds |
| `pollIntervalMs` | `number` | `500` | Polling interval in milliseconds |
| `verbose` | `boolean` | `false` | Enable logging |
| `initialUrl` | `string` | - | Starting URL for redirect detection |

```ts
interface ChallengeResolutionResult {
  resolved: boolean;
  method?: "redirect" | "element_removal";
  waitedMs: number;
}
```
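Putting detection and resolution together — a sketch of a detect-then-wait flow inside `withBrowser` (the URL is a placeholder, and `pool` is a `BrowserPool` from the section above):

```ts
import { detectChallenge, waitForChallengeResolution } from "@vakra-dev/reader";

const title = await pool.withBrowser(async (hero) => {
  await hero.goto("https://protected.example.com");

  // If a Cloudflare challenge is present, wait for it to clear before reading the page.
  const detection = await detectChallenge(hero);
  if (detection.isChallenge) {
    const outcome = await waitForChallengeResolution(hero, {
      maxWaitMs: 45000,
      initialUrl: await hero.url,
    });
    if (!outcome.resolved) {
      throw new Error(`Challenge not resolved after ${outcome.waitedMs}ms`);
    }
  }

  return await hero.document.title;
});
```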
Convert scraped pages to Markdown format.

```ts
import { formatToMarkdown } from "@vakra-dev/reader";

const markdown = formatToMarkdown(
  pages,
  "https://example.com",
  new Date().toISOString(),
  1500,
  metadata
);
```

Convert scraped pages to a complete HTML document.
```ts
import { formatToHTML } from "@vakra-dev/reader";

const html = formatToHTML(
  pages,
  "https://example.com",
  new Date().toISOString(),
  1500,
  metadata
);
```

Convert scraped pages to structured JSON.
```ts
import { formatToJson } from "@vakra-dev/reader";

const json = formatToJson(
  pages,
  "https://example.com",
  new Date().toISOString(),
  1500,
  metadata
);
```

Convert scraped pages to plain text.
```ts
import { formatToText } from "@vakra-dev/reader";

const text = formatToText(
  pages,
  "https://example.com",
  new Date().toISOString(),
  1500,
  metadata
);
```

Remove navigation, ads, scripts, and other non-content elements from HTML.
```ts
import { cleanContent } from "@vakra-dev/reader";

const cleanHtml = cleanContent(rawHtml);
```

Extract metadata from HTML including Open Graph and Twitter cards.
```ts
import { extractMetadata } from "@vakra-dev/reader";

const metadata = extractMetadata(html);
console.log(metadata.title);
console.log(metadata.openGraph?.image);
```

```ts
const DEFAULT_OPTIONS = {
  formats: ["markdown"],
  includeMetadata: true,
  timeoutMs: 30000,
  includePatterns: [],
  excludePatterns: [],
  batchConcurrency: 1,
  batchTimeoutMs: 300000,
  maxRetries: 2,
  verbose: false,
  showChrome: false,
};

const DEFAULT_CRAWL_OPTIONS = {
  depth: 1,
  maxPages: 20,
  scrape: false,
  delayMs: 1000,
  formats: ["markdown", "html"],
  scrapeConcurrency: 2,
  verbose: false,
  showChrome: false,
};

const DEFAULT_POOL_CONFIG = {
  size: 2,
  retireAfterPages: 100,
  retireAfterMinutes: 30,
  maxQueueSize: 100,
  healthCheckIntervalMs: 300000,
};
```

- Getting Started - Quick start guide
- Architecture - System design
- Browser Pool Guide - Pool management
- Cloudflare Bypass Guide - Challenge handling