Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions apps/api/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ lerna-debug.log*
.env.production.local
.env.local

# Local scripts
scripts/

# temp directory
.temp
.tmp
Expand Down
6 changes: 3 additions & 3 deletions apps/api/src/questionnaire/questionnaire.controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -263,12 +263,12 @@ export class QuestionnaireController {
@ApiConsumes('application/json')
@ApiOkResponse({
description:
'Upload file, parse questions (no answers), save to DB, return questionnaireId',
'Upload file and trigger async parsing. Returns runId for realtime tracking.',
schema: {
type: 'object',
properties: {
questionnaireId: { type: 'string' },
totalQuestions: { type: 'number' },
runId: { type: 'string' },
publicAccessToken: { type: 'string' },
},
},
})
Expand Down
43 changes: 20 additions & 23 deletions apps/api/src/questionnaire/questionnaire.service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ import { Injectable, Logger, NotFoundException } from '@nestjs/common';
import type { AnswerQuestionResult } from '@/trigger/questionnaire/answer-question';
import { answerQuestion } from '@/trigger/questionnaire/answer-question';
import { generateAnswerWithRAGBatch } from '@/trigger/questionnaire/answer-question-helpers';
import { tasks } from '@trigger.dev/sdk';
import type { parseQuestionnaireTask } from '@/trigger/questionnaire/parse-questionnaire';
import { ParseQuestionnaireDto } from './dto/parse-questionnaire.dto';
import {
ExportQuestionnaireDto,
Expand Down Expand Up @@ -211,7 +213,8 @@ export class QuestionnaireService {

async uploadAndParse(
dto: UploadAndParseDto,
): Promise<{ questionnaireId: string; totalQuestions: number }> {
): Promise<{ runId: string; publicAccessToken: string }> {
// Upload file to S3 first
const uploadInfo = await uploadQuestionnaireFile({
organizationId: dto.organizationId,
fileName: dto.fileName,
Expand All @@ -220,38 +223,32 @@ export class QuestionnaireService {
source: dto.source || 'internal',
});

// Use AI-powered extraction (faster, handles all file formats)
const questionsAndAnswers = await extractQuestionsWithAI(
dto.fileData,
dto.fileType,
this.contentLogger,
);
if (!uploadInfo) {
throw new Error('Failed to upload questionnaire file to S3');
}

const questionnaireId = await persistQuestionnaireResult(
// Trigger async processing via Trigger.dev
const handle = await tasks.trigger<typeof parseQuestionnaireTask>(
'parse-questionnaire',
{
inputType: 's3' as const,
organizationId: dto.organizationId,
s3Key: uploadInfo.s3Key,
fileName: dto.fileName,
fileType: dto.fileType,
fileSize:
uploadInfo?.fileSize ?? Buffer.from(dto.fileData, 'base64').length,
s3Key: uploadInfo?.s3Key ?? null,
questionsAndAnswers: questionsAndAnswers.map((qa) => ({
question: qa.question,
answer: null,
sources: undefined,
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Source field lost in async task flow

Low Severity

The source field from UploadAndParseDto (which can be 'internal' or 'external') is no longer propagated to the parse-questionnaire task payload. The old synchronous flow explicitly passed source: dto.source || 'internal' to persistQuestionnaireResult, but the new async flow omits it entirely from the tasks.trigger call. The task's db.questionnaire.create also doesn't set source, relying on the Prisma @default("internal"). This silently drops the source value for any caller passing 'external'.

Additional Locations (1)
Fix in Cursor Fix in Web

})),
source: dto.source || 'internal',
fileSize: uploadInfo.fileSize,
},
this.storageLogger,
);

if (!questionnaireId) {
throw new Error('Failed to save questionnaire');
}
this.logger.log('Triggered async questionnaire parsing', {
runId: handle.id,
s3Key: uploadInfo.s3Key,
fileName: dto.fileName,
});

return {
questionnaireId,
totalQuestions: questionsAndAnswers.length,
runId: handle.id,
publicAccessToken: handle.publicAccessToken,
};
}

Expand Down
172 changes: 157 additions & 15 deletions apps/api/src/questionnaire/utils/content-extractor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { createGroq } from '@ai-sdk/groq';
import { generateText, generateObject, jsonSchema } from 'ai';
import ExcelJS from 'exceljs';
import AdmZip from 'adm-zip';
import mammoth from 'mammoth';
import { PARSING_MODEL, VISION_EXTRACTION_PROMPT } from './constants';

/**
Expand Down Expand Up @@ -82,20 +83,31 @@ export async function extractContentFromFile(
return fileBuffer.toString('utf-8');
}

// Handle Word documents - not directly supported
if (isWordDocument(fileType)) {
// Handle Word documents (.docx) — extract text with mammoth
if (isDocxFile(fileType)) {
const result = await mammoth.extractRawText({ buffer: fileBuffer });
return result.value;
}

// Legacy .doc files are not supported
if (fileType === 'application/msword') {
throw new Error(
'Word documents (.docx) are best converted to PDF or image format for parsing. Alternatively, use a URL to view the document.',
'Legacy Word documents (.doc) are not supported. Please convert to .docx or PDF format.',
);
}

// For images and PDFs, use OpenAI vision API
if (isImageOrPdf(fileType)) {
// Handle PDFs using Claude's native multi-page PDF support
if (isPdfFile(fileType)) {
return extractFromPdf(fileData, logger);
}

// Handle images using OpenAI vision API
if (isImageFile(fileType)) {
return extractFromVision(fileData, fileType, logger);
}

throw new Error(
`Unsupported file type: ${fileType}. Supported formats: PDF, images (PNG, JPG, etc.), Excel (.xlsx, .xls), CSV, text files (.txt).`,
`Unsupported file type: ${fileType}. Supported formats: PDF, Word (.docx), images (PNG, JPG, etc.), Excel (.xlsx, .xls), CSV, text files (.txt).`,
);
}

Expand Down Expand Up @@ -132,8 +144,24 @@ export async function extractQuestionsWithAI(
return await parseQuestionsWithGroq(content, logger);
}

// For PDF/images - use vision
if (isImageOrPdf(fileType)) {
// For Word documents (.docx) - extract text then AI parsing
if (isDocxFile(fileType)) {
const fileBuffer = Buffer.from(fileData, 'base64');
const result = await mammoth.extractRawText({ buffer: fileBuffer });
logger.info('Extracted DOCX content', {
contentLength: result.value.length,
extractionMs: Date.now() - startTime,
});
return await parseQuestionsWithGroq(result.value, logger);
}

// For PDFs - use Claude's native PDF support
if (isPdfFile(fileType)) {
return await parseQuestionsFromPdf(fileData, logger);
}

// For images - use OpenAI vision
if (isImageFile(fileType)) {
return await parseQuestionsWithVision(fileData, fileType, logger);
}

Expand Down Expand Up @@ -382,7 +410,7 @@ async function parseQuestionsWithClaude(
});

const { object } = await generateObject({
model: anthropic('claude-3-5-sonnet-latest'),
model: anthropic('claude-sonnet-4-6'),
schema: questionExtractionSchema,
prompt: QUESTION_PROMPT + content.substring(0, 80000),
});
Expand All @@ -394,7 +422,7 @@ async function parseQuestionsWithClaude(
logger.info('Claude parsing complete', {
questionCount: result.questions?.length || 0,
durationMs: Date.now() - startTime,
model: 'claude-3-5-sonnet',
model: 'claude-sonnet-4-6',
});

return (result.questions || [])
Expand Down Expand Up @@ -529,16 +557,19 @@ function isTextFile(fileType: string): boolean {
return fileType === 'text/plain' || fileType.startsWith('text/');
}

function isWordDocument(fileType: string): boolean {
function isDocxFile(fileType: string): boolean {
return (
fileType === 'application/msword' ||
fileType ===
'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
);
}

function isImageOrPdf(fileType: string): boolean {
return fileType.startsWith('image/') || fileType === 'application/pdf';
/**
 * Reports whether a MIME type denotes a PDF document.
 *
 * The check is a strict, case-sensitive equality against `application/pdf`;
 * no trimming or parameter stripping is performed.
 *
 * @param fileType - MIME type string to test.
 * @returns `true` only when `fileType` is exactly `application/pdf`.
 */
function isPdfFile(fileType: string): boolean {
  const pdfMimeType = 'application/pdf';
  return pdfMimeType === fileType;
}

/**
 * Reports whether a MIME type belongs to the `image/` family.
 *
 * The check is a case-sensitive prefix match, so any subtype (including an
 * empty one) is accepted as long as the string begins with `image/`.
 *
 * @param fileType - MIME type string to test.
 * @returns `true` when `fileType` begins with `image/`.
 */
function isImageFile(fileType: string): boolean {
  const imagePrefix = 'image/';
  const leadingSegment = fileType.slice(0, imagePrefix.length);
  return leadingSegment === imagePrefix;
}

/**
Expand Down Expand Up @@ -893,6 +924,117 @@ function extractFromCsv(fileBuffer: Buffer): string {
.join('\n');
}

/**
 * Pulls the raw text out of a PDF by sending the whole document to Claude as
 * a single file part alongside the shared vision extraction prompt.
 *
 * @param fileData - PDF payload. It is decoded as base64 here only to compute
 *   the size that gets logged; the string itself is handed to the model
 *   unchanged.
 * @param logger - Receives an info entry before the call, an info entry on
 *   success, and an error entry on failure.
 * @returns The text produced by the model.
 * @throws Error with the prefix "Failed to extract PDF content: " when the
 *   model call (or the success-path logging) throws.
 */
async function extractFromPdf(
  fileData: string,
  logger: ContentExtractionLogger,
): Promise<string> {
  const decodedByteCount = Buffer.from(fileData, 'base64').length;
  const fileSizeMB = (decodedByteCount / (1024 * 1024)).toFixed(2);

  logger.info('Extracting content from PDF using Claude', { fileSizeMB });

  const startTime = Date.now();
  // Elapsed wall-clock time since the request started, in seconds with two decimals.
  const elapsedSeconds = (): string =>
    ((Date.now() - startTime) / 1000).toFixed(2);

  try {
    const response = await generateText({
      model: anthropic('claude-sonnet-4-6'),
      messages: [
        {
          role: 'user',
          content: [
            { type: 'text', text: VISION_EXTRACTION_PROMPT },
            { type: 'file', data: fileData, mediaType: 'application/pdf' },
          ],
        },
      ],
    });
    const text = response.text;

    const extractionTimeSeconds = elapsedSeconds();
    logger.info('Content extracted from PDF', {
      extractedLength: text.length,
      extractionTimeSeconds,
    });

    return text;
  } catch (error) {
    const extractionTimeSeconds = elapsedSeconds();
    const reason = error instanceof Error ? error.message : 'Unknown error';

    logger.error('Failed to extract content from PDF', {
      fileSizeMB,
      extractionTimeSeconds,
      error: reason,
    });

    throw new Error(`Failed to extract PDF content: ${reason}`);
  }
}

/**
 * Asks Claude to read a PDF directly and return its questions/fields, with
 * any provided answers, as structured output validated against
 * `questionExtractionSchema`.
 *
 * @param fileData - PDF payload, passed to the model as-is in the file part.
 * @param logger - Receives one info entry with the number of questions the
 *   model returned and the elapsed time in milliseconds.
 * @returns Question/answer pairs with both fields trimmed. Entries whose
 *   question is missing or blank are dropped; missing or blank answers
 *   become `null`.
 */
async function parseQuestionsFromPdf(
  fileData: string,
  logger: ContentExtractionLogger,
): Promise<{ question: string; answer: string | null }[]> {
  const startTime = Date.now();

  // Continuation lines stay at column 0 so the prompt text sent to the model is unchanged.
  const instructions = `Extract all questions/fields and their answers from this questionnaire or form document.

Include:
- Traditional questions ending with "?"
- Form fields like "1.1 Vendor Name", "Contact Email" that request input
- Numbered items (1.1, 1.2, 2.1) followed by field labels
- Items marked with "*" or selection notes like "(Single selection allowed)"

Match each to its response if provided. Set answer to null if empty.`;

  const { object } = await generateObject({
    model: anthropic('claude-sonnet-4-6'),
    schema: questionExtractionSchema,
    messages: [
      {
        role: 'user',
        content: [
          { type: 'text', text: instructions },
          { type: 'file', data: fileData, mediaType: 'application/pdf' },
        ],
      },
    ],
  });

  const parsed = object as {
    questions: { question: string; answer: string | null }[];
  };
  // Fall back to an empty list when the model returned no `questions` array.
  const rawQuestions = parsed.questions || [];

  logger.info('PDF question parsing complete', {
    questionCount: rawQuestions.length,
    durationMs: Date.now() - startTime,
  });

  const cleaned: { question: string; answer: string | null }[] = [];
  for (const entry of rawQuestions) {
    // Normalise both fields before deciding whether to keep the entry.
    const question = entry.question?.trim() || '';
    const answer = entry.answer?.trim() || null;
    if (question) {
      cleaned.push({ question, answer });
    }
  }
  return cleaned;
}

async function extractFromVision(
fileData: string,
fileType: string,
Expand Down
9 changes: 6 additions & 3 deletions apps/api/src/trigger/questionnaire/parse-questionnaire.ts
Original file line number Diff line number Diff line change
Expand Up @@ -239,12 +239,14 @@ export const parseQuestionnaireTask = task({
retry: {
maxAttempts: 2,
},
maxDuration: 60 * 30, // 30 minutes (in seconds) for large PDF questionnaires
run: async (payload: {
inputType: 'file' | 'url' | 'attachment' | 's3';
organizationId: string;
fileData?: string;
fileName?: string;
fileType?: string;
fileSize?: number;
url?: string;
attachmentId?: string;
s3Key?: string;
Expand Down Expand Up @@ -342,9 +344,10 @@ export const parseQuestionnaireTask = task({
'questionnaire';
const s3Key = payload.s3Key || '';
const fileType = payload.fileType || 'application/octet-stream';
const fileSize = payload.fileData
? Buffer.from(payload.fileData, 'base64').length
: 0;
const fileSize = payload.fileSize
?? (payload.fileData
? Buffer.from(payload.fileData, 'base64').length
: 0);

const questionnaire = await db.questionnaire.create({
data: {
Expand Down
Loading
Loading