From e8b79054c279bc8c19a4ff5d3005847c23cada7a Mon Sep 17 00:00:00 2001 From: Yuhan Lei Date: Wed, 25 Mar 2026 00:19:32 +0800 Subject: [PATCH] fix(pipeline): harden browser step retries --- src/download/index.test.ts | 4 +- src/download/index.ts | 38 +- src/pipeline/executor.test.ts | 26 ++ src/pipeline/executor.ts | 2 +- src/pipeline/steps/download.test.ts | 599 +++++++++++++++++++++++++++ src/pipeline/steps/download.ts | 112 ++++- src/pipeline/steps/intercept.test.ts | 59 +++ src/pipeline/steps/intercept.ts | 19 +- src/pipeline/steps/tap.test.ts | 16 + src/pipeline/steps/tap.ts | 7 +- 10 files changed, 857 insertions(+), 25 deletions(-) create mode 100644 src/pipeline/steps/download.test.ts create mode 100644 src/pipeline/steps/intercept.test.ts create mode 100644 src/pipeline/steps/tap.test.ts diff --git a/src/download/index.test.ts b/src/download/index.test.ts index 6329f173..3254fd37 100644 --- a/src/download/index.test.ts +++ b/src/download/index.test.ts @@ -49,11 +49,11 @@ describe('download helpers', () => { const destPath = path.join(tempDir, 'file.txt'); const result = await httpDownload(`${baseUrl}/loop`, destPath, { maxRedirects: 2 }); - expect(result).toEqual({ + expect(result).toEqual(expect.objectContaining({ success: false, size: 0, error: 'Too many redirects (> 2)', - }); + })); expect(fs.existsSync(destPath)).toBe(false); }); }); diff --git a/src/download/index.ts b/src/download/index.ts index 62554fb1..ab95020a 100644 --- a/src/download/index.ts +++ b/src/download/index.ts @@ -23,6 +23,15 @@ export interface DownloadOptions { maxRedirects?: number; } +export interface HttpDownloadResult { + success: boolean; + size: number; + error?: string; + statusCode?: number; + contentType?: string; + finalUrl?: string; +} + export interface YtdlpOptions { cookies?: string; cookiesFile?: string; @@ -82,7 +91,7 @@ export async function httpDownload( destPath: string, options: DownloadOptions = {}, redirectCount = 0, -): Promise<{ success: boolean; size: number; error?: string }> { +): Promise { const { cookies, headers = {}, timeout = 30000, onProgress, maxRedirects = 10 } = options; return new Promise((resolve) => { @@ -111,7 +120,7 @@ export async function httpDownload( file.close(); if (fs.existsSync(tempPath)) fs.unlinkSync(tempPath); if (redirectCount >= maxRedirects) { - resolve({ success: false, size: 0, error: `Too many redirects (> ${maxRedirects})` }); + resolve({ success: false, size: 0, error: `Too many redirects (> ${maxRedirects})`, finalUrl: url }); return; } httpDownload( @@ -126,7 +135,14 @@ export async function httpDownload( if (response.statusCode !== 200) { file.close(); if (fs.existsSync(tempPath)) fs.unlinkSync(tempPath); - resolve({ success: false, size: 0, error: `HTTP ${response.statusCode}` }); + resolve({ + success: false, + size: 0, + error: `HTTP ${response.statusCode}`, + statusCode: response.statusCode, + contentType: normalizeHeaderValue(response.headers['content-type']), + finalUrl: url, + }); return; } @@ -144,21 +160,27 @@ export async function httpDownload( file.close(); // Rename temp file to final destination fs.renameSync(tempPath, destPath); - resolve({ success: true, size: received }); + resolve({ + success: true, + size: received, + statusCode: response.statusCode, + contentType: normalizeHeaderValue(response.headers['content-type']), + finalUrl: url, + }); }); }); request.on('error', (err) => { file.close(); if (fs.existsSync(tempPath)) fs.unlinkSync(tempPath); - resolve({ success: false, size: 0, error: err.message }); + resolve({ success: false, size: 0, error: err.message, finalUrl: url }); }); request.on('timeout', () => { request.destroy(); file.close(); if (fs.existsSync(tempPath)) fs.unlinkSync(tempPath); - resolve({ success: false, size: 0, error: 'Timeout' }); + resolve({ success: false, size: 0, error: 'Timeout', finalUrl: url }); }); }); } @@ -167,6 +189,10 @@ export function resolveRedirectUrl(currentUrl: string, location: string): string return new URL(location, currentUrl).toString(); } +function normalizeHeaderValue(header: string | string[] | undefined): string | undefined { + return Array.isArray(header) ? header[0] : header; +} + /** * Export cookies to Netscape format for yt-dlp. */ diff --git a/src/pipeline/executor.test.ts b/src/pipeline/executor.test.ts index 2bc373ad..4658a9f9 100644 --- a/src/pipeline/executor.test.ts +++ b/src/pipeline/executor.test.ts @@ -4,6 +4,7 @@ import { describe, it, expect, vi } from 'vitest'; import { executePipeline } from './index.js'; +import { getStep, registerStep } from './registry.js'; import { ConfigError } from '../errors.js'; import type { IPage } from '../types.js'; @@ -189,4 +190,29 @@ describe('executePipeline', () => { expect(result).toEqual([{ a: 1 }]); expect(page.goto).toHaveBeenCalledWith('https://example.com'); }); + + it.each(['intercept', 'tap'])('retries transient browser errors for %s step', async (stepName) => { + const original = getStep(stepName); + expect(original).toBeDefined(); + + let attempts = 0; + registerStep(stepName, async () => { + attempts += 1; + if (attempts === 1) { + throw new Error('Extension disconnected'); + } + return 'ok'; + }); + + try { + const result = await executePipeline(null, [ + { [stepName]: {} }, + ]); + + expect(result).toBe('ok'); + expect(attempts).toBe(2); + } finally { + registerStep(stepName, original!); + } + }); }); diff --git a/src/pipeline/executor.ts b/src/pipeline/executor.ts index 2135beea..4e789fab 100644 --- a/src/pipeline/executor.ts +++ b/src/pipeline/executor.ts @@ -16,7 +16,7 @@ export interface PipelineContext { } /** Steps that interact with the browser and may fail transiently */ -const BROWSER_STEPS = new Set(['navigate', 'evaluate', 'click', 'type', 'press', 'wait', 'snapshot', 'scroll']); +const BROWSER_STEPS = new Set(['navigate', 'evaluate', 'click', 'type', 'press', 'wait', 'snapshot', 'scroll', 'intercept', 'tap', 'download']); export async function executePipeline( page: IPage | null, diff --git a/src/pipeline/steps/download.test.ts b/src/pipeline/steps/download.test.ts new file mode 100644 index 00000000..3089ed9b --- /dev/null +++ b/src/pipeline/steps/download.test.ts @@ -0,0 +1,599 @@ +/** + * Tests for pipeline/steps/download.ts. + */ + +import * as fs from 'node:fs'; +import * as http from 'node:http'; +import * as os from 'node:os'; +import * as path from 'node:path'; +import { afterEach, describe, expect, it, vi } from 'vitest'; +import * as downloadModule from '../../download/index.js'; +import { executePipeline } from '../index.js'; +import type { IPage } from '../../types.js'; + +const servers: http.Server[] = []; +const tempDirs: string[] = []; + +/** Clean up temp servers and download directories after each test. */ +afterEach(async () => { + await Promise.all(servers.map((server) => new Promise((resolve, reject) => { + server.close((err) => (err ? reject(err) : resolve())); + }))); + servers.length = 0; + + await Promise.all(tempDirs.map((dir) => fs.promises.rm(dir, { recursive: true, force: true }))); + tempDirs.length = 0; + vi.restoreAllMocks(); +}); + +/** Start a local HTTP server for download step tests. */ +async function startServer(handler: http.RequestListener): Promise { + const server = http.createServer(handler); + servers.push(server); + await new Promise((resolve) => server.listen(0, '127.0.0.1', resolve)); + const address = server.address(); + if (!address || typeof address === 'string') { + throw new Error('Failed to start test server'); + } + return `http://127.0.0.1:${address.port}`; +} + +/** Create a minimal browser page mock for download step tests. */ +function createMockPage(overrides: Partial = {}): IPage { + return { + goto: vi.fn(), + evaluate: vi.fn().mockResolvedValue(null), + getCookies: vi.fn().mockResolvedValue([]), + snapshot: vi.fn().mockResolvedValue(''), + click: vi.fn(), + typeText: vi.fn(), + pressKey: vi.fn(), + scrollTo: vi.fn(), + getFormState: vi.fn().mockResolvedValue({}), + wait: vi.fn(), + tabs: vi.fn().mockResolvedValue([]), + closeTab: vi.fn(), + newTab: vi.fn(), + selectTab: vi.fn(), + networkRequests: vi.fn().mockResolvedValue([]), + consoleMessages: vi.fn().mockResolvedValue(''), + scroll: vi.fn(), + autoScroll: vi.fn(), + installInterceptor: vi.fn(), + getInterceptedRequests: vi.fn().mockResolvedValue([]), + screenshot: vi.fn().mockResolvedValue(''), + ...overrides, + }; +} + +describe('stepDownload', () => { + it('retries when anonymous fallback lands on an HTML login page', async () => { + const baseUrl = await startServer((req, res) => { + if (req.url === '/login') { + res.statusCode = 200; + res.setHeader('Content-Type', 'text/html; charset=utf-8'); + res.end('login'); + return; + } + + if (req.headers.cookie !== 'sid=abc') { + res.statusCode = 302; + res.setHeader('Location', '/login'); + res.end(); + return; + } + + res.statusCode = 200; + res.setHeader('Content-Type', 'application/octet-stream'); + res.end('secret'); + }); + + const downloadDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-pipeline-download-')); + tempDirs.push(downloadDir); + + const getCookies = vi.fn() + .mockRejectedValueOnce(new Error('Extension disconnected')) + .mockResolvedValue([{ name: 'sid', value: 'abc', domain: '127.0.0.1' }]); + + const page = createMockPage({ + evaluate: vi.fn().mockResolvedValue([{}]), + getCookies, + }); + + const result = await executePipeline(page, [ + { evaluate: '() => ([{}])' }, + { + download: { + url: `${baseUrl}/protected.bin`, + dir: downloadDir, + filename: 'protected.bin', + progress: false, + }, + }, + ], { args: {}, stepRetries: 2 }) as Array<{ _download: { status: string } }>; + + expect(getCookies).toHaveBeenCalledTimes(2); + expect(result[0]?._download.status).toBe('success'); + expect(fs.readFileSync(path.join(downloadDir, 'protected.bin'), 'utf8')).toBe('secret'); + }); + + it('retries when the original URL returns a 200 HTML login page', async () => { + const baseUrl = await startServer((req, res) => { + if (req.headers.cookie !== 'sid=abc') { + res.statusCode = 200; + res.setHeader('Content-Type', 'text/html; charset=utf-8'); + res.end('login'); + return; + } + + res.statusCode = 200; + res.setHeader('Content-Type', 'application/octet-stream'); + res.end('secret'); + }); + + const downloadDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-pipeline-download-')); + tempDirs.push(downloadDir); + + const getCookies = vi.fn() + .mockRejectedValueOnce(new Error('Extension disconnected')) + .mockResolvedValue([{ name: 'sid', value: 'abc', domain: '127.0.0.1' }]); + + const page = createMockPage({ + evaluate: vi.fn().mockResolvedValue([{}]), + getCookies, + }); + + const result = await executePipeline(page, [ + { evaluate: '() => ([{}])' }, + { + download: { + url: `${baseUrl}/protected.bin`, + dir: downloadDir, + filename: 'protected.bin', + progress: false, + }, + }, + ], { args: {}, stepRetries: 2 }) as Array<{ _download: { status: string } }>; + + expect(getCookies).toHaveBeenCalledTimes(2); + expect(result[0]?._download.status).toBe('success'); + expect(fs.readFileSync(path.join(downloadDir, 'protected.bin'), 'utf8')).toBe('secret'); + }); + + it('retries when a protected json URL returns a 200 HTML login page', async () => { + const baseUrl = await startServer((req, res) => { + if (req.headers.cookie !== 'sid=abc') { + res.statusCode = 200; + res.setHeader('Content-Type', 'text/html; charset=utf-8'); + res.end('login'); + return; + } + + res.statusCode = 200; + res.setHeader('Content-Type', 'application/json'); + res.end('{"secret":true}'); + }); + + const downloadDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-pipeline-download-')); + tempDirs.push(downloadDir); + + const getCookies = vi.fn() + .mockRejectedValueOnce(new Error('Extension disconnected')) + .mockResolvedValue([{ name: 'sid', value: 'abc', domain: '127.0.0.1' }]); + + const page = createMockPage({ + evaluate: vi.fn().mockResolvedValue([{}]), + getCookies, + }); + + const result = await executePipeline(page, [ + { evaluate: '() => ([{}])' }, + { + download: { + url: `${baseUrl}/protected.json`, + dir: downloadDir, + filename: 'protected.json', + progress: false, + }, + }, + ], { args: {}, stepRetries: 2 }) as Array<{ _download: { status: string } }>; + + expect(getCookies).toHaveBeenCalledTimes(2); + expect(result[0]?._download.status).toBe('success'); + expect(fs.readFileSync(path.join(downloadDir, 'protected.json'), 'utf8')).toBe('{"secret":true}'); + }); + + it('retries when a protected extensionless URL returns a 200 HTML login page', async () => { + const baseUrl = await startServer((req, res) => { + if (req.headers.cookie !== 'sid=abc') { + res.statusCode = 200; + res.setHeader('Content-Type', 'text/html; charset=utf-8'); + res.end('login'); + return; + } + + res.statusCode = 200; + res.setHeader('Content-Type', 'application/octet-stream'); + res.end('secret'); + }); + + const downloadDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-pipeline-download-')); + tempDirs.push(downloadDir); + + const getCookies = vi.fn() + .mockRejectedValueOnce(new Error('Extension disconnected')) + .mockResolvedValue([{ name: 'sid', value: 'abc', domain: '127.0.0.1' }]); + + const page = createMockPage({ + evaluate: vi.fn().mockResolvedValue([{}]), + getCookies, + }); + + const result = await executePipeline(page, [ + { evaluate: '() => ([{}])' }, + { + download: { + url: `${baseUrl}/download/123`, + dir: downloadDir, + filename: 'file.bin', + progress: false, + }, + }, + ], { args: {}, stepRetries: 2 }) as Array<{ _download: { status: string } }>; + + expect(getCookies).toHaveBeenCalledTimes(2); + expect(result[0]?._download.status).toBe('success'); + expect(fs.readFileSync(path.join(downloadDir, 'file.bin'), 'utf8')).toBe('secret'); + }); + + it('retries when anonymous fallback gets a 404 for a protected file', async () => { + const baseUrl = await startServer((req, res) => { + if (req.headers.cookie !== 'sid=abc') { + res.statusCode = 404; + res.end('missing'); + return; + } + + res.statusCode = 200; + res.setHeader('Content-Type', 'application/octet-stream'); + res.end('secret'); + }); + + const downloadDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-pipeline-download-')); + tempDirs.push(downloadDir); + + const getCookies = vi.fn() + .mockRejectedValueOnce(new Error('Extension disconnected')) + .mockResolvedValue([{ name: 'sid', value: 'abc', domain: '127.0.0.1' }]); + + const page = createMockPage({ + evaluate: vi.fn().mockResolvedValue([{}]), + getCookies, + }); + + const result = await executePipeline(page, [ + { evaluate: '() => ([{}])' }, + { + download: { + url: `${baseUrl}/hidden.bin`, + dir: downloadDir, + filename: 'hidden.bin', + progress: false, + }, + }, + ], { args: {}, stepRetries: 2 }) as Array<{ _download: { status: string } }>; + + expect(getCookies).toHaveBeenCalledTimes(2); + expect(result[0]?._download.status).toBe('success'); + expect(fs.readFileSync(path.join(downloadDir, 'hidden.bin'), 'utf8')).toBe('secret'); + }); + + it('falls back to anonymous download when public files do not need cookies', async () => { + const baseUrl = await startServer((_req, res) => { + res.statusCode = 200; + res.end('public'); + }); + + const downloadDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-pipeline-download-')); + tempDirs.push(downloadDir); + + const getCookies = vi.fn() + .mockRejectedValue(new Error('Extension disconnected')); + + const page = createMockPage({ + evaluate: vi.fn().mockResolvedValue([{}]), + getCookies, + }); + + const result = await executePipeline(page, [ + { evaluate: '() => ([{}])' }, + { + download: { + url: `${baseUrl}/public.txt`, + dir: downloadDir, + filename: 'public.txt', + progress: false, + }, + }, + ], { args: {}, stepRetries: 2 }) as Array<{ _download: { status: string } }>; + + expect(getCookies).toHaveBeenCalledTimes(1); + expect(result[0]?._download.status).toBe('success'); + expect(fs.readFileSync(path.join(downloadDir, 'public.txt'), 'utf8')).toBe('public'); + }); + + it('keeps successful anonymous fallback for public HTML without an extension', async () => { + const baseUrl = await startServer((_req, res) => { + res.statusCode = 200; + res.setHeader('Content-Type', 'text/html; charset=utf-8'); + res.end('terms'); + }); + + const downloadDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-pipeline-download-')); + tempDirs.push(downloadDir); + + const getCookies = vi.fn() + .mockRejectedValue(new Error('Extension disconnected')); + + const page = createMockPage({ + evaluate: vi.fn().mockResolvedValue([{}]), + getCookies, + }); + + const result = await executePipeline(page, [ + { evaluate: '() => ([{}])' }, + { + download: { + url: `${baseUrl}/terms`, + dir: downloadDir, + filename: 'terms.html', + progress: false, + }, + }, + ], { args: {}, stepRetries: 2 }) as Array<{ _download: { status: string } }>; + + expect(getCookies).toHaveBeenCalledTimes(1); + expect(result[0]?._download.status).toBe('success'); + expect(fs.readFileSync(path.join(downloadDir, 'terms.html'), 'utf8')).toContain('terms'); + }); + + it('keeps successful anonymous fallback for public HTML with the default filename', async () => { + const baseUrl = await startServer((_req, res) => { + res.statusCode = 200; + res.setHeader('Content-Type', 'text/html; charset=utf-8'); + res.end('terms'); + }); + + const downloadDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-pipeline-download-')); + tempDirs.push(downloadDir); + + const getCookies = vi.fn() + .mockRejectedValue(new Error('Extension disconnected')); + + const page = createMockPage({ + evaluate: vi.fn().mockResolvedValue([{}]), + getCookies, + }); + + const result = await executePipeline(page, [ + { evaluate: '() => ([{}])' }, + { + download: { + url: `${baseUrl}/terms`, + dir: downloadDir, + progress: false, + }, + }, + ], { args: {}, stepRetries: 2 }) as Array<{ _download: { status: string; path?: string } }>; + + expect(getCookies).toHaveBeenCalledTimes(1); + expect(result[0]?._download.status).toBe('success'); + expect(result[0]?._download.path).toBeTruthy(); + expect(fs.readFileSync(result[0]!._download.path!, 'utf8')).toContain('terms'); + }); + + it('keeps successful anonymous fallback for public HTML on a php route', async () => { + const baseUrl = await startServer((_req, res) => { + res.statusCode = 200; + res.setHeader('Content-Type', 'text/html; charset=utf-8'); + res.end('terms'); + }); + + const downloadDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-pipeline-download-')); + tempDirs.push(downloadDir); + + const getCookies = vi.fn() + .mockRejectedValue(new Error('Extension disconnected')); + + const page = createMockPage({ + evaluate: vi.fn().mockResolvedValue([{}]), + getCookies, + }); + + const result = await executePipeline(page, [ + { evaluate: '() => ([{}])' }, + { + download: { + url: `${baseUrl}/terms.php`, + dir: downloadDir, + progress: false, + }, + }, + ], { args: {}, stepRetries: 2 }) as Array<{ _download: { status: string; path?: string } }>; + + expect(getCookies).toHaveBeenCalledTimes(1); + expect(result[0]?._download.status).toBe('success'); + expect(result[0]?._download.path).toBeTruthy(); + expect(fs.readFileSync(result[0]!._download.path!, 'utf8')).toContain('terms'); + }); + + it('retries when a protected php route returns a 200 HTML login page for a binary file', async () => { + const baseUrl = await startServer((req, res) => { + if (req.headers.cookie !== 'sid=abc') { + res.statusCode = 200; + res.setHeader('Content-Type', 'text/html; charset=utf-8'); + res.end('login'); + return; + } + + res.statusCode = 200; + res.setHeader('Content-Type', 'application/octet-stream'); + res.end('secret'); + }); + + const downloadDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-pipeline-download-')); + tempDirs.push(downloadDir); + + const getCookies = vi.fn() + .mockRejectedValueOnce(new Error('Extension disconnected')) + .mockResolvedValue([{ name: 'sid', value: 'abc', domain: '127.0.0.1' }]); + + const page = createMockPage({ + evaluate: vi.fn().mockResolvedValue([{}]), + getCookies, + }); + + const result = await executePipeline(page, [ + { evaluate: '() => ([{}])' }, + { + download: { + url: `${baseUrl}/download.php?id=123`, + dir: downloadDir, + filename: 'file.bin', + progress: false, + }, + }, + ], { args: {}, stepRetries: 2 }) as Array<{ _download: { status: string } }>; + + expect(getCookies).toHaveBeenCalledTimes(2); + expect(result[0]?._download.status).toBe('success'); + expect(fs.readFileSync(path.join(downloadDir, 'file.bin'), 'utf8')).toBe('secret'); + }); + + it('retries when a document download returns a 200 HTML login page for a json file', async () => { + const baseUrl = await startServer((req, res) => { + if (req.headers.cookie !== 'sid=abc') { + res.statusCode = 200; + res.setHeader('Content-Type', 'text/html; charset=utf-8'); + res.end('login'); + return; + } + + res.statusCode = 200; + res.setHeader('Content-Type', 'application/json'); + res.end('{"secret":true}'); + }); + + const downloadDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-pipeline-download-')); + tempDirs.push(downloadDir); + + const getCookies = vi.fn() + .mockRejectedValueOnce(new Error('Extension disconnected')) + .mockResolvedValue([{ name: 'sid', value: 'abc', domain: '127.0.0.1' }]); + + const page = createMockPage({ + evaluate: vi.fn().mockResolvedValue([{}]), + getCookies, + }); + + const result = await executePipeline(page, [ + { evaluate: '() => ([{}])' }, + { + download: { + url: `${baseUrl}/protected.json`, + dir: downloadDir, + filename: 'protected.json', + type: 'document', + progress: false, + }, + }, + ], { args: {}, stepRetries: 2 }) as Array<{ _download: { status: string } }>; + + expect(getCookies).toHaveBeenCalledTimes(2); + expect(result[0]?._download.status).toBe('success'); + expect(fs.readFileSync(path.join(downloadDir, 'protected.json'), 'utf8')).toBe('{"secret":true}'); + }); + + it('retries when browser cookie extraction fails with a transient disconnect', async () => { + const baseUrl = await startServer((req, res) => { + if (req.headers.cookie !== 'sid=abc') { + res.statusCode = 403; + res.end('forbidden'); + return; + } + + res.statusCode = 200; + res.end('ok'); + }); + + const downloadDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-pipeline-download-')); + tempDirs.push(downloadDir); + + const getCookies = vi.fn() + .mockRejectedValueOnce(new Error('Extension disconnected')) + .mockResolvedValue([{ name: 'sid', value: 'abc', domain: '127.0.0.1' }]); + + const page = createMockPage({ + evaluate: vi.fn().mockResolvedValue([{}]), + getCookies, + }); + + const result = await executePipeline(page, [ + { evaluate: '() => ([{}])' }, + { + download: { + url: `${baseUrl}/file.txt`, + dir: downloadDir, + filename: 'file.txt', + progress: false, + }, + }, + ], { args: {}, stepRetries: 2 }) as Array<{ _download: { status: string } }>; + + expect(getCookies).toHaveBeenCalledTimes(2); + expect(result[0]?._download.status).toBe('success'); + expect(fs.readFileSync(path.join(downloadDir, 'file.txt'), 'utf8')).toBe('ok'); + }); + + it('retries yt-dlp cookie export when Netscape cookie extraction hits a transient disconnect', async () => { + const ytdlpDownload = vi.spyOn(downloadModule, 'ytdlpDownload').mockResolvedValue({ + success: true, + size: 1, + }); + + const downloadDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'opencli-pipeline-download-')); + tempDirs.push(downloadDir); + + const getCookies = vi.fn() + .mockRejectedValueOnce(new Error('Extension disconnected')) + .mockResolvedValueOnce([{ name: 'sid', value: 'abc', domain: 'example.com' }]); + + const page = createMockPage({ + evaluate: vi.fn().mockResolvedValue([{}]), + getCookies, + }); + + const result = await executePipeline(page, [ + { evaluate: '() => ([{}])' }, + { + download: { + url: 'https://example.com/video.mp4', + dir: downloadDir, + filename: 'video.mp4', + use_ytdlp: true, + progress: false, + }, + }, + ], { args: {}, stepRetries: 2 }) as Array<{ _download: { status: string } }>; + + expect(getCookies).toHaveBeenCalledTimes(2); + expect(ytdlpDownload).toHaveBeenCalledWith( + 'https://example.com/video.mp4', + path.join(downloadDir, 'video.mp4'), + expect.objectContaining({ + cookiesFile: expect.any(String), + }), + ); + expect(result[0]?._download.status).toBe('success'); + }); +}); diff --git a/src/pipeline/steps/download.ts b/src/pipeline/steps/download.ts index 99d09400..f0534774 100644 --- a/src/pipeline/steps/download.ts +++ b/src/pipeline/steps/download.ts @@ -15,6 +15,7 @@ import type { IPage } from '../../types.js'; import { render } from '../template.js'; import { httpDownload, + type HttpDownloadResult, ytdlpDownload, saveDocument, detectContentType, @@ -36,6 +37,67 @@ export interface DownloadResult { duration?: number; } +/** Let executor retry when browser cookie access fails due to a transient disconnect. */ +function isTransientBrowserError(error: unknown): boolean { + const message = error instanceof Error ? error.message : String(error); + return message.includes('Extension disconnected') + || message.includes('attach failed') + || message.includes('no longer exists') + || message.includes('CDP connection') + || message.includes('Daemon command failed'); +} + +const HTML_ROUTE_EXTENSIONS = new Set(['.html', '.htm', '.php', '.asp', '.aspx', '.jsp', '.jspx', '.cfm', '.cgi']); +const HTML_OUTPUT_EXTENSIONS = new Set(['.html', '.htm']); + +/** Decide whether this download target is explicitly or implicitly expected to be an HTML page. */ +function expectsHtmlOutput( + originalUrl: string, + destPath: string, + hasExplicitFilename: boolean, +): boolean { + if (hasExplicitFilename) { + return HTML_OUTPUT_EXTENSIONS.has(path.extname(destPath).toLowerCase()); + } + + const urlExt = path.extname(new URL(originalUrl).pathname).toLowerCase(); + if (HTML_ROUTE_EXTENSIONS.has(urlExt)) { + return true; + } + + if (!urlExt && !hasExplicitFilename) { + return true; + } + + return false; +} + +/** Some protected downloads redirect anonymous requests to an HTML login page instead of the file. */ +function isSuspiciousAnonymousSuccess( + originalUrl: string, + destPath: string, + hasExplicitFilename: boolean, + result: HttpDownloadResult, +): boolean { + const contentType = result.contentType?.toLowerCase() ?? ''; + const htmlExpected = expectsHtmlOutput(originalUrl, destPath, hasExplicitFilename); + return contentType.startsWith('text/html') + && (!htmlExpected || (result.finalUrl !== undefined && result.finalUrl !== originalUrl)); +} + +/** Decide whether an anonymous fallback result is weak enough that browser cookie retry should win. */ +function shouldRetryAnonymousFallback( + originalUrl: string, + destPath: string, + hasExplicitFilename: boolean, + result: HttpDownloadResult, +): boolean { + if (result.success) { + return isSuspiciousAnonymousSuccess(originalUrl, destPath, hasExplicitFilename, result); + } + + return result.statusCode === 401 || result.statusCode === 403 || result.statusCode === 404; +} /** @@ -45,7 +107,10 @@ async function extractBrowserCookies(page: IPage, domain?: string): Promise { const url = String(render(urlTemplate, { args, data, item, index })); @@ -147,12 +212,17 @@ export async function stepDownload( cookiesFile = path.join(tempDir, `cookies_${Date.now()}.txt`); exportCookiesToNetscape(cookiesArray, cookiesFile); } - } catch { - // Ignore cookie extraction errors + } catch (error) { + if (isTransientBrowserError(error)) { + throw error; + } + // Ignore non-transient cookie extraction errors } } } + let retryableStepError: unknown; + // Process downloads with concurrency const results = await mapConcurrent(items, concurrency, async (item, index): Promise => { const startTime = Date.now(); @@ -198,7 +268,7 @@ export async function stepDownload( const detectedType = contentType === 'auto' ? detectContentType(url) : contentType; const shouldUseYtdlp = useYtdlp || (detectedType === 'video' && requiresYtdlp(url)); - let result: { success: boolean; size: number; error?: string }; + let result: HttpDownloadResult; try { if (detectedType === 'document' && contentTemplate) { @@ -234,6 +304,20 @@ export async function stepDownload( } } else { // Direct HTTP download + let cookies = ''; + let cookieError: unknown; + if (page) { + try { + cookies = await extractBrowserCookies(page); + } catch (error) { + if (isTransientBrowserError(error)) { + cookieError = error; + } else { + throw error; + } + } + } + result = await httpDownload(url, destPath, { cookies, timeout, @@ -244,6 +328,14 @@ export async function stepDownload( }, }); + // Public files should still download anonymously when cookie access is flaky. + if (cookieError && shouldRetryAnonymousFallback(url, destPath, Boolean(filenameTemplate), result)) { + if (result.success && fs.existsSync(destPath)) { + fs.unlinkSync(destPath); + } + retryableStepError ??= cookieError; + } + if (progressBar) { progressBar.complete(result.success, result.success ? formatBytes(result.size) : undefined); } @@ -283,5 +375,9 @@ export async function stepDownload( // Show summary tracker.finish(); + if (retryableStepError) { + throw retryableStepError; + } + return results; } diff --git a/src/pipeline/steps/intercept.test.ts b/src/pipeline/steps/intercept.test.ts new file mode 100644 index 00000000..2f94bc37 --- /dev/null +++ b/src/pipeline/steps/intercept.test.ts @@ -0,0 +1,59 @@ +/** + * Tests for pipeline/steps/intercept.ts. + */ + +import { describe, expect, it, vi } from 'vitest'; +import { ConfigError } from '../../errors.js'; +import { stepIntercept } from './intercept.js'; +import type { IPage } from '../../types.js'; + +/** Create a minimal browser page mock for intercept step tests. */ +function createMockPage(overrides: Partial = {}): IPage { + return { + goto: vi.fn(), + evaluate: vi.fn().mockResolvedValue(null), + getCookies: vi.fn().mockResolvedValue([]), + snapshot: vi.fn().mockResolvedValue(''), + click: vi.fn(), + typeText: vi.fn(), + pressKey: vi.fn(), + scrollTo: vi.fn(), + getFormState: vi.fn().mockResolvedValue({}), + wait: vi.fn(), + tabs: vi.fn().mockResolvedValue([]), + closeTab: vi.fn(), + newTab: vi.fn(), + selectTab: vi.fn(), + networkRequests: vi.fn().mockResolvedValue([]), + consoleMessages: vi.fn().mockResolvedValue(''), + scroll: vi.fn(), + autoScroll: vi.fn(), + installInterceptor: vi.fn(), + getInterceptedRequests: vi.fn().mockResolvedValue([]), + screenshot: vi.fn().mockResolvedValue(''), + ...overrides, + }; +} + +describe('stepIntercept', () => { + it('throws ConfigError when browser session is missing', async () => { + await expect(stepIntercept(null, { capture: '/api/posts' }, null, {})).rejects.toBeInstanceOf(ConfigError); + await expect(stepIntercept(null, { capture: '/api/posts' }, null, {})).rejects.toThrow( + 'intercept step requires a browser session', + ); + }); + + it('waits for the configured timeout without truncating it to 3 seconds', async () => { + const page = createMockPage({ + evaluate: vi.fn() + .mockResolvedValueOnce(null) + .mockResolvedValueOnce([{ id: 1 }]), + wait: vi.fn(), + }); + + const result = await stepIntercept(page, { capture: '/api/posts', timeout: 12 }, null, {}); + + expect(result).toEqual({ id: 1 }); + expect(page.wait).toHaveBeenCalledWith(12); + }); +}); diff --git a/src/pipeline/steps/intercept.ts b/src/pipeline/steps/intercept.ts index 26b60d9e..88fe5c84 100644 --- a/src/pipeline/steps/intercept.ts +++ b/src/pipeline/steps/intercept.ts @@ -3,10 +3,15 @@ */ import type { IPage } from '../../types.js'; +import { ConfigError } from '../../errors.js'; import { render, normalizeEvaluateSource } from '../template.js'; import { generateInterceptorJs, generateReadInterceptedJs } from '../../interceptor.js'; export async function stepIntercept(page: IPage | null, params: any, data: any, args: Record): Promise { + if (!page) { + throw new ConfigError('intercept step requires a browser session', 'Set browser: true in your command definition.'); + } + const cfg = typeof params === 'object' ? params : {}; const trigger = cfg.trigger ?? ''; const capturePattern = cfg.capture ?? ''; @@ -16,27 +21,27 @@ export async function stepIntercept(page: IPage | null, params: any, data: any, if (!capturePattern) return data; // Step 1: Inject fetch/XHR interceptor BEFORE trigger - await page!.evaluate(generateInterceptorJs(JSON.stringify(capturePattern))); + await page.evaluate(generateInterceptorJs(JSON.stringify(capturePattern))); // Step 2: Execute the trigger action if (trigger.startsWith('navigate:')) { const url = render(trigger.slice('navigate:'.length), { args, data }); - await page!.goto(String(url)); + await page.goto(String(url)); } else if (trigger.startsWith('evaluate:')) { const js = trigger.slice('evaluate:'.length); - await page!.evaluate(normalizeEvaluateSource(render(js, { args, data }) as string)); + await page.evaluate(normalizeEvaluateSource(render(js, { args, data }) as string)); } else if (trigger.startsWith('click:')) { const ref = render(trigger.slice('click:'.length), { args, data }); - await page!.click(String(ref).replace(/^@/, '')); + await page.click(String(ref).replace(/^@/, '')); } else if (trigger === 'scroll') { - await page!.scroll('down'); + await page.scroll('down'); } // Step 3: Wait a bit for network requests to fire - await page!.wait(Math.min(timeout, 3)); + await page.wait(timeout); // Step 4: Retrieve captured data - const matchingResponses = await page!.evaluate(generateReadInterceptedJs()); + const matchingResponses = await page.evaluate(generateReadInterceptedJs()); // Step 5: Select from response if specified let result = matchingResponses.length === 1 ? matchingResponses[0] : diff --git a/src/pipeline/steps/tap.test.ts b/src/pipeline/steps/tap.test.ts new file mode 100644 index 00000000..f76ddfbe --- /dev/null +++ b/src/pipeline/steps/tap.test.ts @@ -0,0 +1,16 @@ +/** + * Tests for pipeline/steps/tap.ts. + */ + +import { describe, expect, it } from 'vitest'; +import { ConfigError } from '../../errors.js'; +import { stepTap } from './tap.js'; + +describe('stepTap', () => { + it('throws ConfigError when browser session is missing', async () => { + await expect(stepTap(null, { store: 'feed', action: 'load' }, null, {})).rejects.toBeInstanceOf(ConfigError); + await expect(stepTap(null, { store: 'feed', action: 'load' }, null, {})).rejects.toThrow( + 'tap step requires a browser session', + ); + }); +}); diff --git a/src/pipeline/steps/tap.ts b/src/pipeline/steps/tap.ts index 9a58a20f..da16a01a 100644 --- a/src/pipeline/steps/tap.ts +++ b/src/pipeline/steps/tap.ts @@ -10,10 +10,15 @@ */ import type { IPage } from '../../types.js'; +import { ConfigError } from '../../errors.js'; import { render } from '../template.js'; import { generateTapInterceptorJs } from '../../interceptor.js'; export async function stepTap(page: IPage | null, params: any, data: any, args: Record): Promise { + if (!page) { + throw new ConfigError('tap step requires a browser session', 'Set browser: true in your command definition.'); + } + const cfg = typeof params === 'object' ? params : {}; const storeName = String(render(cfg.store ?? '', { args, data })); const actionName = String(render(cfg.action ?? '', { args, data })); @@ -96,5 +101,5 @@ export async function stepTap(page: IPage | null, params: any, data: any, args: } `; - return page!.evaluate(js); + return page.evaluate(js); }