diff --git a/apps/api/.gitignore b/apps/api/.gitignore index dfd2c2e37b..b92a5f112e 100644 --- a/apps/api/.gitignore +++ b/apps/api/.gitignore @@ -47,6 +47,9 @@ lerna-debug.log* .env.production.local .env.local +# Local scripts +scripts/ + # temp directory .temp .tmp diff --git a/apps/api/src/questionnaire/questionnaire.controller.ts b/apps/api/src/questionnaire/questionnaire.controller.ts index 9ea0369f8f..24a20d693b 100644 --- a/apps/api/src/questionnaire/questionnaire.controller.ts +++ b/apps/api/src/questionnaire/questionnaire.controller.ts @@ -263,12 +263,12 @@ export class QuestionnaireController { @ApiConsumes('application/json') @ApiOkResponse({ description: - 'Upload file, parse questions (no answers), save to DB, return questionnaireId', + 'Upload file and trigger async parsing. Returns runId for realtime tracking.', schema: { type: 'object', properties: { - questionnaireId: { type: 'string' }, - totalQuestions: { type: 'number' }, + runId: { type: 'string' }, + publicAccessToken: { type: 'string' }, }, }, }) diff --git a/apps/api/src/questionnaire/questionnaire.service.ts b/apps/api/src/questionnaire/questionnaire.service.ts index 71bda767fb..6714c441d5 100644 --- a/apps/api/src/questionnaire/questionnaire.service.ts +++ b/apps/api/src/questionnaire/questionnaire.service.ts @@ -2,6 +2,8 @@ import { Injectable, Logger, NotFoundException } from '@nestjs/common'; import type { AnswerQuestionResult } from '@/trigger/questionnaire/answer-question'; import { answerQuestion } from '@/trigger/questionnaire/answer-question'; import { generateAnswerWithRAGBatch } from '@/trigger/questionnaire/answer-question-helpers'; +import { tasks } from '@trigger.dev/sdk'; +import type { parseQuestionnaireTask } from '@/trigger/questionnaire/parse-questionnaire'; import { ParseQuestionnaireDto } from './dto/parse-questionnaire.dto'; import { ExportQuestionnaireDto, @@ -211,7 +213,8 @@ export class QuestionnaireService { async uploadAndParse( dto: UploadAndParseDto, - ): Promise<{ questionnaireId: string; totalQuestions: number }> { + ): Promise<{ runId: string; publicAccessToken: string }> { + // Upload file to S3 first const uploadInfo = await uploadQuestionnaireFile({ organizationId: dto.organizationId, fileName: dto.fileName, @@ -220,38 +223,32 @@ export class QuestionnaireService { source: dto.source || 'internal', }); - // Use AI-powered extraction (faster, handles all file formats) - const questionsAndAnswers = await extractQuestionsWithAI( - dto.fileData, - dto.fileType, - this.contentLogger, - ); + if (!uploadInfo) { + throw new Error('Failed to upload questionnaire file to S3'); + } - const questionnaireId = await persistQuestionnaireResult( + // Trigger async processing via Trigger.dev + const handle = await tasks.trigger( + 'parse-questionnaire', { + inputType: 's3' as const, organizationId: dto.organizationId, + s3Key: uploadInfo.s3Key, fileName: dto.fileName, fileType: dto.fileType, - fileSize: - uploadInfo?.fileSize ?? Buffer.from(dto.fileData, 'base64').length, - s3Key: uploadInfo?.s3Key ?? null, - questionsAndAnswers: questionsAndAnswers.map((qa) => ({ - question: qa.question, - answer: null, - sources: undefined, - })), - source: dto.source || 'internal', + fileSize: uploadInfo.fileSize, }, - this.storageLogger, ); - if (!questionnaireId) { - throw new Error('Failed to save questionnaire'); - } + this.logger.log('Triggered async questionnaire parsing', { + runId: handle.id, + s3Key: uploadInfo.s3Key, + fileName: dto.fileName, + }); return { - questionnaireId, - totalQuestions: questionsAndAnswers.length, + runId: handle.id, + publicAccessToken: handle.publicAccessToken, }; } diff --git a/apps/api/src/questionnaire/utils/content-extractor.ts b/apps/api/src/questionnaire/utils/content-extractor.ts index 5608801d20..77c5348fca 100644 --- a/apps/api/src/questionnaire/utils/content-extractor.ts +++ b/apps/api/src/questionnaire/utils/content-extractor.ts @@ -4,6 +4,7 @@ import { createGroq } from '@ai-sdk/groq'; import { generateText, generateObject, jsonSchema } from 'ai'; import ExcelJS from 'exceljs'; import AdmZip from 'adm-zip'; +import mammoth from 'mammoth'; import { PARSING_MODEL, VISION_EXTRACTION_PROMPT } from './constants'; /** @@ -82,20 +83,31 @@ export async function extractContentFromFile( return fileBuffer.toString('utf-8'); } - // Handle Word documents - not directly supported - if (isWordDocument(fileType)) { + // Handle Word documents (.docx) — extract text with mammoth + if (isDocxFile(fileType)) { + const result = await mammoth.extractRawText({ buffer: fileBuffer }); + return result.value; + } + + // Legacy .doc files are not supported + if (fileType === 'application/msword') { throw new Error( - 'Word documents (.docx) are best converted to PDF or image format for parsing. Alternatively, use a URL to view the document.', + 'Legacy Word documents (.doc) are not supported. Please convert to .docx or PDF format.', ); } - // For images and PDFs, use OpenAI vision API - if (isImageOrPdf(fileType)) { + // Handle PDFs using Claude's native multi-page PDF support + if (isPdfFile(fileType)) { + return extractFromPdf(fileData, logger); + } + + // Handle images using OpenAI vision API + if (isImageFile(fileType)) { return extractFromVision(fileData, fileType, logger); } throw new Error( - `Unsupported file type: ${fileType}. Supported formats: PDF, images (PNG, JPG, etc.), Excel (.xlsx, .xls), CSV, text files (.txt).`, + `Unsupported file type: ${fileType}. Supported formats: PDF, Word (.docx), images (PNG, JPG, etc.), Excel (.xlsx, .xls), CSV, text files (.txt).`, ); } @@ -132,8 +144,24 @@ export async function extractQuestionsWithAI( return await parseQuestionsWithGroq(content, logger); } - // For PDF/images - use vision - if (isImageOrPdf(fileType)) { + // For Word documents (.docx) - extract text then AI parsing + if (isDocxFile(fileType)) { + const fileBuffer = Buffer.from(fileData, 'base64'); + const result = await mammoth.extractRawText({ buffer: fileBuffer }); + logger.info('Extracted DOCX content', { + contentLength: result.value.length, + extractionMs: Date.now() - startTime, + }); + return await parseQuestionsWithGroq(result.value, logger); + } + + // For PDFs - use Claude's native PDF support + if (isPdfFile(fileType)) { + return await parseQuestionsFromPdf(fileData, logger); + } + + // For images - use OpenAI vision + if (isImageFile(fileType)) { return await parseQuestionsWithVision(fileData, fileType, logger); } @@ -382,7 +410,7 @@ async function parseQuestionsWithClaude( }); const { object } = await generateObject({ - model: anthropic('claude-3-5-sonnet-latest'), + model: anthropic('claude-sonnet-4-6'), schema: questionExtractionSchema, prompt: QUESTION_PROMPT + content.substring(0, 80000), }); @@ -394,7 +422,7 @@ async function parseQuestionsWithClaude( logger.info('Claude parsing complete', { questionCount: result.questions?.length || 0, durationMs: Date.now() - startTime, - model: 'claude-3-5-sonnet', + model: 'claude-sonnet-4-6', }); return (result.questions || []) @@ -529,16 +557,19 @@ function isTextFile(fileType: string): boolean { return fileType === 'text/plain' || fileType.startsWith('text/'); } -function isWordDocument(fileType: string): boolean { +function isDocxFile(fileType: string): boolean { return ( - fileType === 'application/msword' || fileType === - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' ); } -function isImageOrPdf(fileType: string): boolean { - return fileType.startsWith('image/') || fileType === 'application/pdf'; +function isPdfFile(fileType: string): boolean { + return fileType === 'application/pdf'; +} + +function isImageFile(fileType: string): boolean { + return fileType.startsWith('image/'); } /** @@ -893,6 +924,117 @@ function extractFromCsv(fileBuffer: Buffer): string { .join('\n'); } +/** + * Extract raw text content from a PDF using Claude's native multi-page support + */ +async function extractFromPdf( + fileData: string, + logger: ContentExtractionLogger, +): Promise { + const fileSizeMB = ( + Buffer.from(fileData, 'base64').length / + (1024 * 1024) + ).toFixed(2); + + logger.info('Extracting content from PDF using Claude', { + fileSizeMB, + }); + + const startTime = Date.now(); + + try { + const { text } = await generateText({ + model: anthropic('claude-sonnet-4-6'), + messages: [ + { + role: 'user', + content: [ + { type: 'text', text: VISION_EXTRACTION_PROMPT }, + { + type: 'file', + data: fileData, + mediaType: 'application/pdf', + }, + ], + }, + ], + }); + + const extractionTime = ((Date.now() - startTime) / 1000).toFixed(2); + logger.info('Content extracted from PDF', { + extractedLength: text.length, + extractionTimeSeconds: extractionTime, + }); + + return text; + } catch (error) { + const extractionTime = ((Date.now() - startTime) / 1000).toFixed(2); + logger.error('Failed to extract content from PDF', { + fileSizeMB, + extractionTimeSeconds: extractionTime, + error: error instanceof Error ? error.message : 'Unknown error', + }); + throw new Error( + `Failed to extract PDF content: ${error instanceof Error ? error.message : 'Unknown error'}`, + ); + } +} + +/** + * Extract questions directly from a PDF using Claude's native multi-page support + */ +async function parseQuestionsFromPdf( + fileData: string, + logger: ContentExtractionLogger, +): Promise<{ question: string; answer: string | null }[]> { + const startTime = Date.now(); + + const { object } = await generateObject({ + model: anthropic('claude-sonnet-4-6'), + schema: questionExtractionSchema, + messages: [ + { + role: 'user', + content: [ + { + type: 'text', + text: `Extract all questions/fields and their answers from this questionnaire or form document. + +Include: +- Traditional questions ending with "?" +- Form fields like "1.1 Vendor Name", "Contact Email" that request input +- Numbered items (1.1, 1.2, 2.1) followed by field labels +- Items marked with "*" or selection notes like "(Single selection allowed)" + +Match each to its response if provided. Set answer to null if empty.`, + }, + { + type: 'file', + data: fileData, + mediaType: 'application/pdf', + }, + ], + }, + ], + }); + + const result = object as { + questions: { question: string; answer: string | null }[]; + }; + + logger.info('PDF question parsing complete', { + questionCount: result.questions?.length || 0, + durationMs: Date.now() - startTime, + }); + + return (result.questions || []) + .map((q) => ({ + question: q.question?.trim() || '', + answer: q.answer?.trim() || null, + })) + .filter((q) => q.question); +} + async function extractFromVision( fileData: string, fileType: string, diff --git a/apps/api/src/trigger/questionnaire/parse-questionnaire.ts b/apps/api/src/trigger/questionnaire/parse-questionnaire.ts index fe2b2a5522..471bb078fd 100644 --- a/apps/api/src/trigger/questionnaire/parse-questionnaire.ts +++ b/apps/api/src/trigger/questionnaire/parse-questionnaire.ts @@ -239,12 +239,14 @@ export const parseQuestionnaireTask = task({ retry: { maxAttempts: 2, }, + maxDuration: 60 * 30, // 30 minutes (in seconds) for large PDF questionnaires run: async (payload: { inputType: 'file' | 'url' | 'attachment' | 's3'; organizationId: string; fileData?: string; fileName?: string; fileType?: string; + fileSize?: number; url?: string; attachmentId?: string; s3Key?: string; @@ -342,9 +344,10 @@ export const parseQuestionnaireTask = task({ 'questionnaire'; const s3Key = payload.s3Key || ''; const fileType = payload.fileType || 'application/octet-stream'; - const fileSize = payload.fileData - ? Buffer.from(payload.fileData, 'base64').length - : 0; + const fileSize = payload.fileSize + ?? (payload.fileData + ? Buffer.from(payload.fileData, 'base64').length + : 0); const questionnaire = await db.questionnaire.create({ data: { diff --git a/apps/api/src/trigger/vector-store/helpers/extract-content-from-file.spec.ts b/apps/api/src/trigger/vector-store/helpers/extract-content-from-file.spec.ts index f3a106ab09..c7455d0315 100644 --- a/apps/api/src/trigger/vector-store/helpers/extract-content-from-file.spec.ts +++ b/apps/api/src/trigger/vector-store/helpers/extract-content-from-file.spec.ts @@ -1,7 +1,8 @@ import { extractContentFromFile } from './extract-content-from-file'; import ExcelJS from 'exceljs'; +import { generateText } from 'ai'; -// Mock external dependencies that aren't relevant to Excel tests +// Mock external dependencies jest.mock('@/vector-store/logger', () => ({ logger: { info: jest.fn(), @@ -10,8 +11,12 @@ jest.mock('@/vector-store/logger', () => ({ }, })); +jest.mock('@ai-sdk/anthropic', () => ({ + anthropic: jest.fn(() => 'claude-mock-model'), +})); + jest.mock('@ai-sdk/openai', () => ({ - openai: jest.fn(), + openai: jest.fn(() => 'openai-mock-model'), })); jest.mock('ai', () => ({ @@ -25,6 +30,10 @@ jest.mock('mammoth', () => ({ }, })); +const mockGenerateText = generateText as jest.MockedFunction< + typeof generateText +>; + async function createTestExcelBuffer( sheets: { name: string; rows: (string | number)[][] }[], ): Promise { @@ -160,3 +169,90 @@ describe('extractContentFromFile - non-Excel types', () => { ).rejects.toThrow('Legacy Word documents'); }); }); + +describe('extractContentFromFile - PDF extraction', () => { + beforeEach(() => { + jest.clearAllMocks(); + }); + + it('should use Claude for PDF files', async () => { + const pdfContent = 'Extracted SOC 2 report content'; + mockGenerateText.mockResolvedValue({ + text: pdfContent, + } as Awaited>); + + const base64 = Buffer.from('fake-pdf-data').toString('base64'); + const result = await extractContentFromFile(base64, 'application/pdf'); + + expect(result).toBe(pdfContent); + expect(mockGenerateText).toHaveBeenCalledWith( + expect.objectContaining({ + model: 'claude-mock-model', + messages: expect.arrayContaining([ + expect.objectContaining({ + role: 'user', + content: expect.arrayContaining([ + expect.objectContaining({ type: 'text' }), + expect.objectContaining({ + type: 'file', + mediaType: 'application/pdf', + }), + ]), + }), + ]), + }), + ); + }); + + it('should throw on PDF extraction failure', async () => { + mockGenerateText.mockRejectedValue(new Error('API rate limit')); + + const base64 = Buffer.from('fake-pdf-data').toString('base64'); + + await expect( + extractContentFromFile(base64, 'application/pdf'), + ).rejects.toThrow('Failed to extract PDF content: API rate limit'); + }); +}); + +describe('extractContentFromFile - image extraction', () => { + beforeEach(() => { + jest.clearAllMocks(); + }); + + it('should use OpenAI vision for images', async () => { + const imageContent = 'Text from image'; + mockGenerateText.mockResolvedValue({ + text: imageContent, + } as Awaited>); + + const base64 = Buffer.from('fake-image-data').toString('base64'); + const result = await extractContentFromFile(base64, 'image/png'); + + expect(result).toBe(imageContent); + expect(mockGenerateText).toHaveBeenCalledWith( + expect.objectContaining({ + model: 'openai-mock-model', + messages: expect.arrayContaining([ + expect.objectContaining({ + role: 'user', + content: expect.arrayContaining([ + expect.objectContaining({ type: 'text' }), + expect.objectContaining({ type: 'image' }), + ]), + }), + ]), + }), + ); + }); + + it('should throw on image extraction failure', async () => { + mockGenerateText.mockRejectedValue(new Error('Vision API error')); + + const base64 = Buffer.from('fake-image-data').toString('base64'); + + await expect( + extractContentFromFile(base64, 'image/png'), + ).rejects.toThrow('Failed to extract image content: Vision API error'); + }); +}); diff --git a/apps/api/src/trigger/vector-store/helpers/extract-content-from-file.ts b/apps/api/src/trigger/vector-store/helpers/extract-content-from-file.ts index e57d84e9c3..d9f7916d80 100644 --- a/apps/api/src/trigger/vector-store/helpers/extract-content-from-file.ts +++ b/apps/api/src/trigger/vector-store/helpers/extract-content-from-file.ts @@ -1,4 +1,5 @@ import { logger } from '@/vector-store/logger'; +import { anthropic } from '@ai-sdk/anthropic'; import { openai } from '@ai-sdk/openai'; import { generateText } from 'ai'; import ExcelJS from 'exceljs'; @@ -200,20 +201,70 @@ export async function extractContentFromFile( ); } - // For images and PDFs, use OpenAI vision API - const isImage = fileType.startsWith('image/'); + // Handle PDFs using Claude's native multi-page PDF support const isPdf = fileType === 'application/pdf'; - if (isImage || isPdf) { - const base64Data = fileData; - const mimeType = fileType; - const fileSizeMB = ( - Buffer.from(fileData, 'base64').length / - (1024 * 1024) - ).toFixed(2); + if (isPdf) { + const fileSizeMB = (fileBuffer.length / (1024 * 1024)).toFixed(2); + + logger.info('Extracting content from PDF using Claude', { + fileType, + fileSizeMB, + }); + + const startTime = Date.now(); + + try { + const { text } = await generateText({ + model: anthropic('claude-sonnet-4-6'), + messages: [ + { + role: 'user', + content: [ + { + type: 'text', + text: 'Extract all text content from this document. Preserve the structure, formatting, and order of the content. Include all paragraphs, headings, lists, tables, and any other text elements. Return the extracted text in a clear, readable format.', + }, + { + type: 'file', + data: fileData, + mediaType: 'application/pdf', + }, + ], + }, + ], + }); + + const extractionTime = ((Date.now() - startTime) / 1000).toFixed(2); + logger.info('Content extracted from PDF', { + fileType, + extractedLength: text.length, + extractionTimeSeconds: extractionTime, + }); - logger.info('Extracting content from PDF/image using vision API', { - fileType: mimeType, + return text; + } catch (error) { + const extractionTime = ((Date.now() - startTime) / 1000).toFixed(2); + logger.error('Failed to extract content from PDF', { + fileType, + fileSizeMB, + extractionTimeSeconds: extractionTime, + error: error instanceof Error ? error.message : 'Unknown error', + }); + throw new Error( + `Failed to extract PDF content: ${error instanceof Error ? error.message : 'Unknown error'}`, + ); + } + } + + // Handle images using OpenAI vision API + const isImage = fileType.startsWith('image/'); + + if (isImage) { + const fileSizeMB = (fileBuffer.length / (1024 * 1024)).toFixed(2); + + logger.info('Extracting content from image using vision API', { + fileType, fileSizeMB, }); @@ -221,18 +272,18 @@ export async function extractContentFromFile( try { const { text } = await generateText({ - model: openai('gpt-4o-mini'), // Using gpt-4o-mini for better text extraction + model: openai('gpt-4o-mini'), messages: [ { role: 'user', content: [ { type: 'text', - text: `Extract all text content from this document. Preserve the structure, formatting, and order of the content. Include all paragraphs, headings, lists, tables, and any other text elements. Return the extracted text in a clear, readable format.`, + text: 'Extract all text content from this image. Preserve the structure, formatting, and order of the content. Include all paragraphs, headings, lists, tables, and any other text elements. Return the extracted text in a clear, readable format.', }, { type: 'image', - image: `data:${mimeType};base64,${base64Data}`, + image: `data:${fileType};base64,${fileData}`, }, ], }, @@ -240,8 +291,8 @@ export async function extractContentFromFile( }); const extractionTime = ((Date.now() - startTime) / 1000).toFixed(2); - logger.info('Content extracted from PDF/image', { - fileType: mimeType, + logger.info('Content extracted from image', { + fileType, extractedLength: text.length, extractionTimeSeconds: extractionTime, }); @@ -249,14 +300,14 @@ export async function extractContentFromFile( return text; } catch (error) { const extractionTime = ((Date.now() - startTime) / 1000).toFixed(2); - logger.error('Failed to extract content from PDF/image', { - fileType: mimeType, + logger.error('Failed to extract content from image', { + fileType, fileSizeMB, extractionTimeSeconds: extractionTime, error: error instanceof Error ? error.message : 'Unknown error', }); throw new Error( - `Failed to extract content: ${error instanceof Error ? error.message : 'Unknown error'}`, + `Failed to extract image content: ${error instanceof Error ? error.message : 'Unknown error'}`, ); } } diff --git a/apps/api/src/trigger/vector-store/process-knowledge-base-documents-orchestrator.ts b/apps/api/src/trigger/vector-store/process-knowledge-base-documents-orchestrator.ts index 2e74a47d59..296b0239e6 100644 --- a/apps/api/src/trigger/vector-store/process-knowledge-base-documents-orchestrator.ts +++ b/apps/api/src/trigger/vector-store/process-knowledge-base-documents-orchestrator.ts @@ -12,6 +12,7 @@ export const processKnowledgeBaseDocumentsOrchestratorTask = task({ retry: { maxAttempts: 3, }, + maxDuration: 60 * 60, // 1 hour (in seconds) for large document batches run: async (payload: { documentIds: string[]; organizationId: string }) => { logger.info('Starting Knowledge Base documents processing orchestrator', { organizationId: payload.organizationId, diff --git a/apps/api/src/vendors/vendors.service.ts b/apps/api/src/vendors/vendors.service.ts index d7831233c8..ae1a2acb39 100644 --- a/apps/api/src/vendors/vendors.service.ts +++ b/apps/api/src/vendors/vendors.service.ts @@ -280,7 +280,9 @@ export class VendorsService { const normalizedWebsite = normalizeWebsite(vendorWebsite); if (!normalizedWebsite) { - throw new Error('Vendor website is missing or invalid'); + throw new BadRequestException( + 'Vendor website is missing or invalid. Please add a website URL with http:// or https:// protocol before triggering an assessment.', + ); } const handle = await tasks.trigger('vendor-risk-assessment-task', { diff --git a/apps/app/src/app/(app)/[orgId]/questionnaire/hooks/useQuestionnaireParse.ts b/apps/app/src/app/(app)/[orgId]/questionnaire/hooks/useQuestionnaireParse.ts index 2361b3b2cb..6274ecbcce 100644 --- a/apps/app/src/app/(app)/[orgId]/questionnaire/hooks/useQuestionnaireParse.ts +++ b/apps/app/src/app/(app)/[orgId]/questionnaire/hooks/useQuestionnaireParse.ts @@ -1,6 +1,8 @@ 'use client'; import { api } from '@/lib/api-client'; +import { isFailureRunStatus } from '@/app/(app)/[orgId]/cloud-tests/status'; +import { useRealtimeRun } from '@trigger.dev/react-hooks'; import { useRouter } from 'next/navigation'; import { useCallback, useEffect, useRef, useState } from 'react'; import { toast } from 'sonner'; @@ -36,6 +38,8 @@ export function useQuestionnaireParse({ const router = useRouter(); const [uploadStatus, setUploadStatus] = useState('idle'); const [parseStatus, setParseStatus] = useState('idle'); + const [runId, setRunId] = useState(null); + const [runToken, setRunToken] = useState(null); const abortControllerRef = useRef(null); // Get trigger token for auto-answer (can trigger and read) @@ -61,6 +65,57 @@ export function useQuestionnaireParse({ } }, [autoAnswerToken, setAutoAnswerToken]); + // Track the parse task via realtime + const { run: parseRun } = useRealtimeRun(runId ?? '', { + accessToken: runToken ?? undefined, + enabled: Boolean(runId && runToken), + }); + + // Handle task completion + useEffect(() => { + if (!parseRun?.status) return; + + if (parseRun.status === 'COMPLETED') { + const output = parseRun.output as { + success: boolean; + questionnaireId: string; + questionsAndAnswers: { question: string; answer: string | null }[]; + } | undefined; + + if (output?.success && output.questionnaireId) { + setQuestionnaireId(output.questionnaireId); + toast.success( + `Successfully parsed ${output.questionsAndAnswers?.length ?? 0} questions`, + ); + router.push(`/${orgId}/questionnaire/${output.questionnaireId}`); + } else { + setIsParseProcessStarted(false); + toast.error('Failed to parse questionnaire'); + } + + setRunId(null); + setRunToken(null); + setUploadStatus('idle'); + setParseStatus('idle'); + } + + if (isFailureRunStatus(parseRun.status)) { + setIsParseProcessStarted(false); + setRunId(null); + setRunToken(null); + setUploadStatus('idle'); + setParseStatus('idle'); + toast.error('Questionnaire parsing failed. Please try again.'); + } + }, [ + parseRun?.status, + parseRun?.output, + orgId, + router, + setIsParseProcessStarted, + setQuestionnaireId, + ]); + const executeUploadAndParse = useCallback( async (input: { fileName: string; @@ -78,40 +133,45 @@ export function useQuestionnaireParse({ setParseStatus('executing'); try { - const response = await api.post<{ questionnaireId: string; totalQuestions: number }>( - '/v1/questionnaire/upload-and-parse', - { - organizationId: input.organizationId, - fileName: input.fileName, - fileType: input.fileType, - fileData: input.fileData, - source: 'internal', - }, - ); + const response = await api.post<{ + runId: string; + publicAccessToken: string; + }>('/v1/questionnaire/upload-and-parse', { + organizationId: input.organizationId, + fileName: input.fileName, + fileType: input.fileType, + fileData: input.fileData, + source: 'internal', + }); if (response.error || !response.data) { setIsParseProcessStarted(false); - toast.error(response.error || 'Failed to parse questionnaire'); + setUploadStatus('idle'); + setParseStatus('idle'); + toast.error(response.error || 'Failed to start questionnaire parsing'); return; } - const { questionnaireId, totalQuestions } = response.data; - setQuestionnaireId(questionnaireId); - toast.success(`Successfully parsed ${totalQuestions} questions`); - router.push(`/${orgId}/questionnaire/${questionnaireId}`); + // Upload done, now tracking async parsing + setUploadStatus('idle'); + setRunId(response.data.runId); + setRunToken(response.data.publicAccessToken); } catch (error) { if (error instanceof Error && error.name === 'AbortError') { return; // Request was cancelled } setIsParseProcessStarted(false); - console.error('Parse error:', error); - toast.error(error instanceof Error ? error.message : 'Failed to parse questionnaire'); - } finally { setUploadStatus('idle'); setParseStatus('idle'); + console.error('Parse error:', error); + toast.error( + error instanceof Error + ? error.message + : 'Failed to parse questionnaire', + ); } }, - [orgId, router, setIsParseProcessStarted, setQuestionnaireId], + [setIsParseProcessStarted], ); // Cleanup on unmount diff --git a/packages/integration-platform/src/manifests/google-workspace/__tests__/check-user-filter.test.ts b/packages/integration-platform/src/manifests/google-workspace/__tests__/check-user-filter.test.ts new file mode 100644 index 0000000000..d32a462d52 --- /dev/null +++ b/packages/integration-platform/src/manifests/google-workspace/__tests__/check-user-filter.test.ts @@ -0,0 +1,84 @@ +import { describe, expect, it } from 'bun:test'; +import { + filterGoogleWorkspaceUsersForChecks, + parseGoogleWorkspaceCheckUserFilter, + shouldIncludeGoogleWorkspaceUserForCheck, +} from '../check-user-filter'; +import type { GoogleWorkspaceUser } from '../types'; + +const baseUser = (overrides: Partial): GoogleWorkspaceUser => ({ + id: 'u1', + primaryEmail: 'a@example.com', + name: { givenName: 'A', familyName: 'B', fullName: 'A B' }, + isAdmin: false, + isDelegatedAdmin: false, + isEnrolledIn2Sv: true, + isEnforcedIn2Sv: false, + suspended: false, + archived: false, + creationTime: '', + lastLoginTime: '', + orgUnitPath: '/Staff', + ...overrides, +}); + +describe('parseGoogleWorkspaceCheckUserFilter', () => { + it('parses connection-style variables', () => { + const config = parseGoogleWorkspaceCheckUserFilter({ + target_org_units: ['/Staff'], + sync_excluded_emails: ['skip@example.com'], + sync_included_emails: [], + sync_user_filter_mode: 'exclude', + include_suspended: 'false', + }); + expect(config.targetOrgUnits).toEqual(['/Staff']); + expect(config.excludedTerms).toContain('skip@example.com'); + expect(config.userFilterMode).toBe('exclude'); + expect(config.includeSuspended).toBe(false); + }); +}); + +describe('shouldIncludeGoogleWorkspaceUserForCheck', () => { + it('drops users outside target OUs', () => { + const config = parseGoogleWorkspaceCheckUserFilter({ + target_org_units: ['/Engineering'], + include_suspended: 'false', + }); + expect( + shouldIncludeGoogleWorkspaceUserForCheck( + baseUser({ orgUnitPath: '/Sales' }), + config, + ), + ).toBe(false); + expect( + shouldIncludeGoogleWorkspaceUserForCheck( + baseUser({ orgUnitPath: '/Engineering/TeamA' }), + config, + ), + ).toBe(true); + }); + + it('respects exclude email terms', () => { + const config = parseGoogleWorkspaceCheckUserFilter({ + sync_excluded_emails: ['a@example.com'], + sync_user_filter_mode: 'exclude', + include_suspended: 'false', + }); + expect(shouldIncludeGoogleWorkspaceUserForCheck(baseUser({}), config)).toBe(false); + }); +}); + +describe('filterGoogleWorkspaceUsersForChecks', () => { + it('filters an array', () => { + const config = parseGoogleWorkspaceCheckUserFilter({ + sync_user_filter_mode: 'include', + sync_included_emails: ['keep@example.com'], + include_suspended: 'false', + }); + const users = [ + baseUser({ primaryEmail: 'keep@example.com' }), + baseUser({ id: 'u2', primaryEmail: 'drop@example.com' }), + ]; + expect(filterGoogleWorkspaceUsersForChecks(users, config)).toHaveLength(1); + }); +}); diff --git a/packages/integration-platform/src/manifests/google-workspace/check-user-filter.ts b/packages/integration-platform/src/manifests/google-workspace/check-user-filter.ts new file mode 100644 index 0000000000..04446131d5 --- /dev/null +++ b/packages/integration-platform/src/manifests/google-workspace/check-user-filter.ts @@ -0,0 +1,82 @@ +import { matchesSyncFilterTerms, parseSyncFilterTerms } from '../../sync-filter/email-exclusion-terms'; +import type { CheckVariableValues } from '../../types'; +import type { GoogleWorkspaceUser } from './types'; + +/** Sync mode for directory users — aligned with `sync_user_filter_mode` connection variables. */ +export type GoogleWorkspaceUserSyncFilterMode = 'all' | 'exclude' | 'include'; + +/** Parsed filter state shared by GWS checks (2FA, employee access) and aligned with employee sync. */ +export interface GoogleWorkspaceCheckUserFilterConfig { + targetOrgUnits: string[] | undefined; + excludedTerms: string[]; + includedTerms: string[]; + userFilterMode: GoogleWorkspaceUserSyncFilterMode | undefined; + includeSuspended: boolean; +} + +/** + * Reads integration variables into a filter config (org units, sync email include/exclude). + */ +export function parseGoogleWorkspaceCheckUserFilter( + variables: CheckVariableValues, +): GoogleWorkspaceCheckUserFilterConfig { + return { + targetOrgUnits: variables.target_org_units as string[] | undefined, + excludedTerms: parseSyncFilterTerms( + variables.sync_excluded_emails ?? variables.excluded_emails, + ), + includedTerms: parseSyncFilterTerms(variables.sync_included_emails), + userFilterMode: variables.sync_user_filter_mode as GoogleWorkspaceUserSyncFilterMode | undefined, + includeSuspended: variables.include_suspended === 'true', + }; +} + +/** + * Whether a directory user should be included in a GWS security check, using the same rules as + * `sync.controller.ts` employee sync (OU first, then email terms). + */ +export function shouldIncludeGoogleWorkspaceUserForCheck( + user: GoogleWorkspaceUser, + config: GoogleWorkspaceCheckUserFilterConfig, +): boolean { + if (user.suspended && !config.includeSuspended) { + return false; + } + + if (user.archived) { + return false; + } + + const { targetOrgUnits } = config; + if (targetOrgUnits && targetOrgUnits.length > 0) { + const userOu = user.orgUnitPath ?? '/'; + const inOrgUnit = targetOrgUnits.some( + (ou) => ou === '/' || userOu === ou || userOu.startsWith(`${ou}/`), + ); + if (!inOrgUnit) { + return false; + } + } + + const email = user.primaryEmail.toLowerCase(); + + if (config.userFilterMode === 'exclude' && config.excludedTerms.length > 0) { + return !matchesSyncFilterTerms(email, config.excludedTerms); + } + + if (config.userFilterMode === 'include') { + if (config.includedTerms.length === 0) { + return true; + } + return matchesSyncFilterTerms(email, config.includedTerms); + } + + return true; +} + +export function filterGoogleWorkspaceUsersForChecks( + users: GoogleWorkspaceUser[], + config: GoogleWorkspaceCheckUserFilterConfig, +): GoogleWorkspaceUser[] { + return users.filter((user) => shouldIncludeGoogleWorkspaceUserForCheck(user, config)); +} diff --git a/packages/integration-platform/src/manifests/google-workspace/checks/employee-access.ts b/packages/integration-platform/src/manifests/google-workspace/checks/employee-access.ts index 615648e7fb..b04a54f8c3 100644 --- a/packages/integration-platform/src/manifests/google-workspace/checks/employee-access.ts +++ b/packages/integration-platform/src/manifests/google-workspace/checks/employee-access.ts @@ -1,12 +1,16 @@ import { TASK_TEMPLATES } from '../../../task-mappings'; import type { CheckContext, IntegrationCheck } from '../../../types'; +import { + filterGoogleWorkspaceUsersForChecks, + parseGoogleWorkspaceCheckUserFilter, +} from '../check-user-filter'; import type { GoogleWorkspaceRoleAssignmentsResponse, GoogleWorkspaceRolesResponse, GoogleWorkspaceUser, GoogleWorkspaceUsersResponse, } from '../types'; -import { includeSuspendedVariable } from '../variables'; +import { includeSuspendedVariable, targetOrgUnitsVariable } from '../variables'; /** * Employee Access Review Check @@ -18,12 +22,12 @@ export const employeeAccessCheck: IntegrationCheck = { name: 'Employee Access Review', description: 'Fetch all employees and their roles from Google Workspace for access review', taskMapping: TASK_TEMPLATES.employeeAccess, - variables: [includeSuspendedVariable], + variables: [targetOrgUnitsVariable, includeSuspendedVariable], run: async (ctx: CheckContext) => { ctx.log('Starting Google Workspace Employee Access check'); - const includeSuspended = ctx.variables.include_suspended === 'true'; + const userFilterConfig = parseGoogleWorkspaceCheckUserFilter(ctx.variables); // Fetch all roles first to build a role ID -> name map ctx.log('Fetching available roles...'); @@ -123,16 +127,8 @@ export const employeeAccessCheck: IntegrationCheck = { ctx.log(`Fetched ${allUsers.length} total users`); - // Filter users - const activeUsers = allUsers.filter((user) => { - if (user.suspended && !includeSuspended) { - return false; - } - if (user.archived) { - return false; - } - return true; - }); + // Same rules as 2FA check and employee sync (sync.controller.ts) + const activeUsers = filterGoogleWorkspaceUsersForChecks(allUsers, userFilterConfig); ctx.log(`Found ${activeUsers.length} active users after filtering`); diff --git a/packages/integration-platform/src/manifests/google-workspace/checks/two-factor-auth.ts b/packages/integration-platform/src/manifests/google-workspace/checks/two-factor-auth.ts index e784ab03a9..3493f6dc37 100644 --- a/packages/integration-platform/src/manifests/google-workspace/checks/two-factor-auth.ts +++ b/packages/integration-platform/src/manifests/google-workspace/checks/two-factor-auth.ts @@ -1,6 +1,9 @@ import { TASK_TEMPLATES } from '../../../task-mappings'; import type { CheckContext, IntegrationCheck } from '../../../types'; -import { matchesSyncFilterTerms, parseSyncFilterTerms } from '../../../sync-filter/email-exclusion-terms'; +import { + filterGoogleWorkspaceUsersForChecks, + parseGoogleWorkspaceCheckUserFilter, +} from '../check-user-filter'; import type { GoogleWorkspaceUser, GoogleWorkspaceUsersResponse } from '../types'; import { includeSuspendedVariable, targetOrgUnitsVariable } from '../variables'; @@ -18,13 +21,7 @@ export const twoFactorAuthCheck: IntegrationCheck = { run: async (ctx: CheckContext) => { ctx.log('Starting Google Workspace 2FA check'); - const targetOrgUnits = ctx.variables.target_org_units as string[] | undefined; - const excludedTerms = parseSyncFilterTerms( - ctx.variables.sync_excluded_emails ?? ctx.variables.excluded_emails, - ); - const includedTerms = parseSyncFilterTerms(ctx.variables.sync_included_emails); - const userFilterMode = ctx.variables.sync_user_filter_mode as 'all' | 'exclude' | 'include' | undefined; - const includeSuspended = ctx.variables.include_suspended === 'true'; + const userFilterConfig = parseGoogleWorkspaceCheckUserFilter(ctx.variables); // Fetch all users with pagination const allUsers: GoogleWorkspaceUser[] = []; @@ -54,44 +51,8 @@ export const twoFactorAuthCheck: IntegrationCheck = { ctx.log(`Fetched ${allUsers.length} total users`); - // Filter users based on settings - const usersToCheck = allUsers.filter((user) => { - // Skip suspended users unless explicitly included - if (user.suspended && !includeSuspended) { - return false; - } - - // Skip archived users - if (user.archived) { - return false; - } - - // Org units first, then sync email filter — same order as employee sync (sync.controller.ts) - if (targetOrgUnits && targetOrgUnits.length > 0) { - const userOu = user.orgUnitPath ?? '/'; - const inOrgUnit = targetOrgUnits.some( - (ou) => ou === '/' || userOu === ou || userOu.startsWith(`${ou}/`), - ); - if (!inOrgUnit) { - return false; - } - } - - const email = user.primaryEmail.toLowerCase(); - - if (userFilterMode === 'exclude' && excludedTerms.length > 0) { - return !matchesSyncFilterTerms(email, excludedTerms); - } - - if (userFilterMode === 'include') { - if (includedTerms.length === 0) { - return true; - } - return matchesSyncFilterTerms(email, includedTerms); - } - - return true; - }); + // Org units + sync email filter — same rules as employee sync (sync.controller.ts) + const usersToCheck = filterGoogleWorkspaceUsersForChecks(allUsers, userFilterConfig); ctx.log(`Checking ${usersToCheck.length} users after filtering`);