trycompai · tofikwest · Mar 24, 2026 · Mar 23, 2026 · Mar 23, 2026 · Mar 23, 2026
diff --git a/apps/api/.gitignore b/apps/api/.gitignore
@@ -47,6 +47,9 @@ lerna-debug.log*
 .env.production.local
 .env.local
 
+# Local scripts
+scripts/
+
 # temp directory
 .temp
 .tmp

diff --git a/apps/api/src/questionnaire/questionnaire.controller.ts b/apps/api/src/questionnaire/questionnaire.controller.ts
@@ -263,12 +263,12 @@ export class QuestionnaireController {
   @ApiConsumes('application/json')
   @ApiOkResponse({
     description:
-      'Upload file, parse questions (no answers), save to DB, return questionnaireId',
+      'Upload file and trigger async parsing. Returns runId for realtime tracking.',
     schema: {
       type: 'object',
       properties: {
-        questionnaireId: { type: 'string' },
-        totalQuestions: { type: 'number' },
+        runId: { type: 'string' },
+        publicAccessToken: { type: 'string' },
       },
     },
   })

diff --git a/apps/api/src/questionnaire/questionnaire.service.ts b/apps/api/src/questionnaire/questionnaire.service.ts
@@ -2,6 +2,8 @@ import { Injectable, Logger, NotFoundException } from '@nestjs/common';
 import type { AnswerQuestionResult } from '@/trigger/questionnaire/answer-question';
 import { answerQuestion } from '@/trigger/questionnaire/answer-question';
 import { generateAnswerWithRAGBatch } from '@/trigger/questionnaire/answer-question-helpers';
+import { tasks } from '@trigger.dev/sdk';
+import type { parseQuestionnaireTask } from '@/trigger/questionnaire/parse-questionnaire';
 import { ParseQuestionnaireDto } from './dto/parse-questionnaire.dto';
 import {
   ExportQuestionnaireDto,
@@ -211,7 +213,8 @@ export class QuestionnaireService {
 
   async uploadAndParse(
     dto: UploadAndParseDto,
-  ): Promise<{ questionnaireId: string; totalQuestions: number }> {
+  ): Promise<{ runId: string; publicAccessToken: string }> {
+    // Upload file to S3 first
     const uploadInfo = await uploadQuestionnaireFile({
       organizationId: dto.organizationId,
       fileName: dto.fileName,
@@ -220,38 +223,32 @@ export class QuestionnaireService {
       source: dto.source || 'internal',
     });
 
-    // Use AI-powered extraction (faster, handles all file formats)
-    const questionsAndAnswers = await extractQuestionsWithAI(
-      dto.fileData,
-      dto.fileType,
-      this.contentLogger,
-    );
+    if (!uploadInfo) {
+      throw new Error('Failed to upload questionnaire file to S3');
+    }
 
-    const questionnaireId = await persistQuestionnaireResult(
+    // Trigger async processing via Trigger.dev
+    const handle = await tasks.trigger<typeof parseQuestionnaireTask>(
+      'parse-questionnaire',
       {
+        inputType: 's3' as const,
         organizationId: dto.organizationId,
+        s3Key: uploadInfo.s3Key,
         fileName: dto.fileName,
         fileType: dto.fileType,
-        fileSize:
-          uploadInfo?.fileSize ?? Buffer.from(dto.fileData, 'base64').length,
-        s3Key: uploadInfo?.s3Key ?? null,
-        questionsAndAnswers: questionsAndAnswers.map((qa) => ({
-          question: qa.question,
-          answer: null,
-          sources: undefined,
-        })),
-        source: dto.source || 'internal',
+        fileSize: uploadInfo.fileSize,
       },
-      this.storageLogger,
     );
 
-    if (!questionnaireId) {
-      throw new Error('Failed to save questionnaire');
-    }
+    this.logger.log('Triggered async questionnaire parsing', {
+      runId: handle.id,
+      s3Key: uploadInfo.s3Key,
+      fileName: dto.fileName,
+    });
 
     return {
-      questionnaireId,
-      totalQuestions: questionsAndAnswers.length,
+      runId: handle.id,
+      publicAccessToken: handle.publicAccessToken,
     };
   }
 

diff --git a/apps/api/src/questionnaire/utils/content-extractor.ts b/apps/api/src/questionnaire/utils/content-extractor.ts
@@ -4,6 +4,7 @@ import { createGroq } from '@ai-sdk/groq';
 import { generateText, generateObject, jsonSchema } from 'ai';
 import ExcelJS from 'exceljs';
 import AdmZip from 'adm-zip';
+import mammoth from 'mammoth';
 import { PARSING_MODEL, VISION_EXTRACTION_PROMPT } from './constants';
 
 /**
@@ -82,20 +83,31 @@ export async function extractContentFromFile(
     return fileBuffer.toString('utf-8');
   }
 
-  // Handle Word documents - not directly supported
-  if (isWordDocument(fileType)) {
+  // Handle Word documents (.docx) — extract text with mammoth
+  if (isDocxFile(fileType)) {
+    const result = await mammoth.extractRawText({ buffer: fileBuffer });
+    return result.value;
+  }
+
+  // Legacy .doc files are not supported
+  if (fileType === 'application/msword') {
     throw new Error(
-      'Word documents (.docx) are best converted to PDF or image format for parsing. Alternatively, use a URL to view the document.',
+      'Legacy Word documents (.doc) are not supported. Please convert to .docx or PDF format.',
     );
   }
 
-  // For images and PDFs, use OpenAI vision API
-  if (isImageOrPdf(fileType)) {
+  // Handle PDFs using Claude's native multi-page PDF support
+  if (isPdfFile(fileType)) {
+    return extractFromPdf(fileData, logger);
+  }
+
+  // Handle images using OpenAI vision API
+  if (isImageFile(fileType)) {
     return extractFromVision(fileData, fileType, logger);
   }
 
   throw new Error(
-    `Unsupported file type: ${fileType}. Supported formats: PDF, images (PNG, JPG, etc.), Excel (.xlsx, .xls), CSV, text files (.txt).`,
+    `Unsupported file type: ${fileType}. Supported formats: PDF, Word (.docx), images (PNG, JPG, etc.), Excel (.xlsx, .xls), CSV, text files (.txt).`,
   );
 }
 
@@ -132,8 +144,24 @@ export async function extractQuestionsWithAI(
       return await parseQuestionsWithGroq(content, logger);
     }
 
-    // For PDF/images - use vision
-    if (isImageOrPdf(fileType)) {
+    // For Word documents (.docx) - extract text then AI parsing
+    if (isDocxFile(fileType)) {
+      const fileBuffer = Buffer.from(fileData, 'base64');
+      const result = await mammoth.extractRawText({ buffer: fileBuffer });
+      logger.info('Extracted DOCX content', {
+        contentLength: result.value.length,
+        extractionMs: Date.now() - startTime,
+      });
+      return await parseQuestionsWithGroq(result.value, logger);
+    }
+
+    // For PDFs - use Claude's native PDF support
+    if (isPdfFile(fileType)) {
+      return await parseQuestionsFromPdf(fileData, logger);
+    }
+
+    // For images - use OpenAI vision
+    if (isImageFile(fileType)) {
       return await parseQuestionsWithVision(fileData, fileType, logger);
     }
 
@@ -382,7 +410,7 @@ async function parseQuestionsWithClaude(
     });
 
     const { object } = await generateObject({
-      model: anthropic('claude-3-5-sonnet-latest'),
+      model: anthropic('claude-sonnet-4-6'),
       schema: questionExtractionSchema,
       prompt: QUESTION_PROMPT + content.substring(0, 80000),
     });
@@ -394,7 +422,7 @@ async function parseQuestionsWithClaude(
     logger.info('Claude parsing complete', {
       questionCount: result.questions?.length || 0,
       durationMs: Date.now() - startTime,
-      model: 'claude-3-5-sonnet',
+      model: 'claude-sonnet-4-6',
     });
 
     return (result.questions || [])
@@ -529,16 +557,19 @@ function isTextFile(fileType: string): boolean {
   return fileType === 'text/plain' || fileType.startsWith('text/');
 }
 
-function isWordDocument(fileType: string): boolean {
+function isDocxFile(fileType: string): boolean {
   return (
-    fileType === 'application/msword' ||
     fileType ===
-      'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+    'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
   );
 }
 
-function isImageOrPdf(fileType: string): boolean {
-  return fileType.startsWith('image/') || fileType === 'application/pdf';
+function isPdfFile(fileType: string): boolean {
+  return fileType === 'application/pdf';
+}
+
+function isImageFile(fileType: string): boolean {
+  return fileType.startsWith('image/');
 }
 
 /**
@@ -893,6 +924,117 @@ function extractFromCsv(fileBuffer: Buffer): string {
     .join('\n');
 }
 
+/**
+ * Extract raw text content from a PDF using Claude's native multi-page support
+ */
+async function extractFromPdf(
+  fileData: string,
+  logger: ContentExtractionLogger,
+): Promise<string> {
+  const fileSizeMB = (
+    Buffer.from(fileData, 'base64').length /
+    (1024 * 1024)
+  ).toFixed(2);
+
+  logger.info('Extracting content from PDF using Claude', {
+    fileSizeMB,
+  });
+
+  const startTime = Date.now();
+
+  try {
+    const { text } = await generateText({
+      model: anthropic('claude-sonnet-4-6'),
+      messages: [
+        {
+          role: 'user',
+          content: [
+            { type: 'text', text: VISION_EXTRACTION_PROMPT },
+            {
+              type: 'file',
+              data: fileData,
+              mediaType: 'application/pdf',
+            },
+          ],
+        },
+      ],
+    });
+
+    const extractionTime = ((Date.now() - startTime) / 1000).toFixed(2);
+    logger.info('Content extracted from PDF', {
+      extractedLength: text.length,
+      extractionTimeSeconds: extractionTime,
+    });
+
+    return text;
+  } catch (error) {
+    const extractionTime = ((Date.now() - startTime) / 1000).toFixed(2);
+    logger.error('Failed to extract content from PDF', {
+      fileSizeMB,
+      extractionTimeSeconds: extractionTime,
+      error: error instanceof Error ? error.message : 'Unknown error',
+    });
+    throw new Error(
+      `Failed to extract PDF content: ${error instanceof Error ? error.message : 'Unknown error'}`,
+    );
+  }
+}
+
+/**
+ * Extract questions directly from a PDF using Claude's native multi-page support
+ */
+async function parseQuestionsFromPdf(
+  fileData: string,
+  logger: ContentExtractionLogger,
+): Promise<{ question: string; answer: string | null }[]> {
+  const startTime = Date.now();
+
+  const { object } = await generateObject({
+    model: anthropic('claude-sonnet-4-6'),
+    schema: questionExtractionSchema,
+    messages: [
+      {
+        role: 'user',
+        content: [
+          {
+            type: 'text',
+            text: `Extract all questions/fields and their answers from this questionnaire or form document.
+
+Include:
+- Traditional questions ending with "?"
+- Form fields like "1.1 Vendor Name", "Contact Email" that request input
+- Numbered items (1.1, 1.2, 2.1) followed by field labels
+- Items marked with "*" or selection notes like "(Single selection allowed)"
+
+Match each to its response if provided. Set answer to null if empty.`,
+          },
+          {
+            type: 'file',
+            data: fileData,
+            mediaType: 'application/pdf',
+          },
+        ],
+      },
+    ],
+  });
+
+  const result = object as {
+    questions: { question: string; answer: string | null }[];
+  };
+
+  logger.info('PDF question parsing complete', {
+    questionCount: result.questions?.length || 0,
+    durationMs: Date.now() - startTime,
+  });
+
+  return (result.questions || [])
+    .map((q) => ({
+      question: q.question?.trim() || '',
+      answer: q.answer?.trim() || null,
+    }))
+    .filter((q) => q.question);
+}
+
 async function extractFromVision(
   fileData: string,
   fileType: string,

diff --git a/apps/api/src/trigger/questionnaire/parse-questionnaire.ts b/apps/api/src/trigger/questionnaire/parse-questionnaire.ts
@@ -239,12 +239,14 @@ export const parseQuestionnaireTask = task({
   retry: {
     maxAttempts: 2,
   },
+  maxDuration: 60 * 30, // 30 minutes (in seconds) for large PDF questionnaires
   run: async (payload: {
     inputType: 'file' | 'url' | 'attachment' | 's3';
     organizationId: string;
     fileData?: string;
     fileName?: string;
     fileType?: string;
+    fileSize?: number;
     url?: string;
     attachmentId?: string;
     s3Key?: string;
@@ -342,9 +344,10 @@ export const parseQuestionnaireTask = task({
           'questionnaire';
         const s3Key = payload.s3Key || '';
         const fileType = payload.fileType || 'application/octet-stream';
-        const fileSize = payload.fileData
-          ? Buffer.from(payload.fileData, 'base64').length
-          : 0;
+        const fileSize = payload.fileSize
+          ?? (payload.fileData
+            ? Buffer.from(payload.fileData, 'base64').length
+            : 0);
 
         const questionnaire = await db.questionnaire.create({
           data: {