Skip to content
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import { filterUsersByOrgUnits } from './sync-ou-filter';

/**
 * Minimal user shape used by the fixtures in this spec: just the fields the
 * tests read back after filtering.
 */
type TestUser = {
  /** Address used by the tests to identify which users survived filtering. */
  primaryEmail: string;
  /** Organizational-unit path the filter is matched against (e.g. `/Engineering`). */
  orgUnitPath: string;
  /** Optional flag; the tests only check that it is carried through untouched. */
  suspended?: boolean;
};

describe('filterUsersByOrgUnits', () => {
  // Shared fixture: one user at the root, a parent/child pair under
  // /Engineering, two users in sibling OUs, and one suspended user.
  const users: TestUser[] = [
    { primaryEmail: 'alice@example.com', orgUnitPath: '/' },
    { primaryEmail: 'bob@example.com', orgUnitPath: '/Engineering' },
    { primaryEmail: 'carol@example.com', orgUnitPath: '/Engineering/Frontend' },
    { primaryEmail: 'dave@example.com', orgUnitPath: '/Marketing' },
    { primaryEmail: 'eve@example.com', orgUnitPath: '/HR' },
    { primaryEmail: 'frank@example.com', orgUnitPath: '/Unlisted', suspended: true },
  ];

  /** Filters the shared fixture and returns only the emails, in result order. */
  const emailsFor = (targetOrgUnits: string[] | undefined): string[] =>
    filterUsersByOrgUnits(users, targetOrgUnits).map(
      ({ primaryEmail }) => primaryEmail,
    );

  it('returns all users when no target OUs specified', () => {
    expect(filterUsersByOrgUnits(users, undefined)).toEqual(users);
  });

  it('returns all users when target OUs is an empty array', () => {
    expect(filterUsersByOrgUnits(users, [])).toEqual(users);
  });

  it('filters users to only those in selected OUs', () => {
    expect(emailsFor(['/Engineering'])).toEqual([
      'bob@example.com',
      'carol@example.com',
    ]);
  });

  it('includes users in child OUs of selected OUs', () => {
    expect(emailsFor(['/Engineering'])).toContain('carol@example.com');
  });

  it('exact match on OU path works', () => {
    expect(emailsFor(['/Engineering/Frontend'])).toEqual(['carol@example.com']);
  });

  it('supports multiple target OUs', () => {
    expect(emailsFor(['/Engineering', '/Marketing'])).toEqual([
      'bob@example.com',
      'carol@example.com',
      'dave@example.com',
    ]);
  });

  it('root OU includes all users', () => {
    expect(filterUsersByOrgUnits(users, ['/'])).toEqual(users);
  });

  it('excludes users not in any selected OU', () => {
    const kept = emailsFor(['/Engineering']);
    const expectedAbsent = [
      'alice@example.com',
      'dave@example.com',
      'eve@example.com',
      'frank@example.com',
    ];
    for (const email of expectedAbsent) {
      expect(kept).not.toContain(email);
    }
  });

  it('does not match partial OU path names', () => {
    // A target of /Eng is a string prefix of /Engineering but not the same OU
    // or an ancestor of it, so nothing may be returned.
    expect(filterUsersByOrgUnits(users, ['/Eng'])).toEqual([]);
  });

  it('preserves suspended user status through filtering', () => {
    const expected: TestUser[] = [
      {
        primaryEmail: 'frank@example.com',
        orgUnitPath: '/Unlisted',
        suspended: true,
      },
    ];
    expect(filterUsersByOrgUnits(users, ['/Unlisted'])).toEqual(expected);
  });
});
23 changes: 23 additions & 0 deletions apps/api/src/integration-platform/controllers/sync-ou-filter.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/**
 * Filters users by organizational unit (OU) paths.
 *
 * A user is kept when their `orgUnitPath` equals a target OU or lies beneath
 * it (the path starts with the target followed by `/`). A plain string prefix
 * is not enough: `/Eng` does not match `/Engineering`. Users without an
 * `orgUnitPath` are treated as belonging to the root OU (`/`).
 *
 * Trailing slashes on target paths are ignored, so `/Engineering/` behaves
 * exactly like `/Engineering`. Without this, a trailing slash in the
 * configuration would silently match no users at all.
 *
 * @param users - Array of objects with an optional `orgUnitPath` property
 * @param targetOrgUnits - OU paths to include; `undefined` or an empty array
 *   disables filtering and returns `users` unchanged
 * @returns The users located in, or beneath, at least one target OU, in their
 *   original order
 */
export function filterUsersByOrgUnits<
  T extends { orgUnitPath?: string },
>(users: T[], targetOrgUnits: string[] | undefined): T[] {
  if (!targetOrgUnits || targetOrgUnits.length === 0) {
    return users;
  }

  // Normalize every target once, outside the per-user loop.
  const targets = targetOrgUnits.map((ou) => {
    let end = ou.length;
    while (end > 0 && ou.charAt(end - 1) === '/') {
      end -= 1;
    }
    const path = ou.slice(0, end);
    return {
      path,
      childPrefix: `${path}/`,
      // A non-empty target made only of slashes denotes the root OU.
      isRoot: ou.length > 0 && path === '',
    };
  });

  // The root OU contains everyone; return a fresh array like `filter` would.
  if (targets.some((target) => target.isRoot)) {
    return [...users];
  }

  return users.filter((user) => {
    const userOu = user.orgUnitPath ?? '/';
    return targets.some(
      ({ path, childPrefix }) =>
        userOu === path || userOu.startsWith(childPrefix),
    );
  });
}
24 changes: 19 additions & 5 deletions apps/api/src/integration-platform/controllers/sync.controller.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import {
import { RampRoleMappingService } from '../services/ramp-role-mapping.service';
import { IntegrationSyncLoggerService } from '../services/integration-sync-logger.service';
import { RampApiService } from '../services/ramp-api.service';
import { filterUsersByOrgUnits } from './sync-ou-filter';

interface GoogleWorkspaceUser {
id: string;
Expand Down Expand Up @@ -284,6 +285,19 @@ export class SyncController {
string,
unknown
>;

// Filter by organizational unit if configured
const targetOrgUnits = Array.isArray(syncVariables.target_org_units)
? (syncVariables.target_org_units as string[])
: undefined;
const ouFilteredUsers = filterUsersByOrgUnits(users, targetOrgUnits);

if (targetOrgUnits && targetOrgUnits.length > 0) {
this.logger.log(
`Google Workspace OU filter kept ${ouFilteredUsers.length}/${users.length} users (OUs: ${targetOrgUnits.join(', ')})`,
);
}

const rawSyncFilterMode = syncVariables.sync_user_filter_mode;
const syncFilterMode: GoogleWorkspaceSyncFilterMode =
typeof rawSyncFilterMode === 'string' &&
Expand All @@ -307,7 +321,7 @@ export class SyncController {
effectiveSyncFilterMode = 'all';
}

const filteredUsers = users.filter((user) => {
const filteredUsers = ouFilteredUsers.filter((user) => {
const email = user.primaryEmail.toLowerCase();

if (effectiveSyncFilterMode === 'exclude' && excludedTerms.length > 0) {
Expand All @@ -322,7 +336,7 @@ export class SyncController {
});

this.logger.log(
`Google Workspace sync filter mode "${effectiveSyncFilterMode}" kept ${filteredUsers.length}/${users.length} users`,
`Google Workspace sync filter mode "${effectiveSyncFilterMode}" kept ${filteredUsers.length}/${ouFilteredUsers.length} users`,
);

// Active users to import/reactivate are based on the selected filter mode
Expand All @@ -336,10 +350,10 @@ export class SyncController {
activeUsers.map((u) => u.primaryEmail.toLowerCase()),
);
const allSuspendedEmails = new Set(
users.filter((u) => u.suspended).map((u) => u.primaryEmail.toLowerCase()),
ouFilteredUsers.filter((u) => u.suspended).map((u) => u.primaryEmail.toLowerCase()),
);
const allActiveEmails = new Set(
users
ouFilteredUsers
.filter((u) => !u.suspended)
.map((u) => u.primaryEmail.toLowerCase()),
);
Expand Down Expand Up @@ -467,7 +481,7 @@ export class SyncController {

const deactivationGwDomains =
effectiveSyncFilterMode === 'include'
? new Set(users.map((u) => u.primaryEmail.split('@')[1]?.toLowerCase()))
? new Set(ouFilteredUsers.map((u) => u.primaryEmail.split('@')[1]?.toLowerCase()))
: new Set(
filteredUsers.map((u) =>
u.primaryEmail.split('@')[1]?.toLowerCase(),
Expand Down
2 changes: 1 addition & 1 deletion apps/api/src/trigger/policies/update-policy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { logger, metadata, queue, schemaTask } from '@trigger.dev/sdk';
import { z } from 'zod';
import { processPolicyUpdate } from './update-policy-helpers';

export const updatePolicyQueue = queue({ name: 'update-policy', concurrencyLimit: 50 });
const updatePolicyQueue = queue({ name: 'update-policy', concurrencyLimit: 50 });

export const updatePolicy = schemaTask({
id: 'update-policy',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import Firecrawl from '@mendable/firecrawl-js';
import { logger } from '@trigger.dev/sdk';
import { vendorRiskAssessmentAgentSchema } from './agent-schema';
import type { VendorRiskAssessmentDataV1 } from './agent-types';
import { extractVendorDomain, validateVendorUrl } from './url-validation';

function normalizeUrl(url: string | null | undefined): string | null {
if (!url) return null;
Expand Down Expand Up @@ -58,19 +59,30 @@ export async function firecrawlAgentVendorRiskAssessment(params: {

const firecrawlClient = new Firecrawl({ apiKey });

// Extract vendor domain for URL validation
const vendorDomain = extractVendorDomain(vendorWebsite);
if (!vendorDomain) {
logger.warn('Could not extract vendor domain for URL validation', {
vendorWebsite,
});
return null;
}

const prompt = `Complete cyber security research on the vendor "${vendorName}" with website ${vendorWebsite}.

CRITICAL: Only return URLs that belong to the domain "${vendorDomain}" or its subdomains (e.g., trust.${vendorDomain}, security.${vendorDomain}). Do NOT return URLs from any other domain. If you cannot find a page on ${vendorDomain}, return an empty string for that field rather than a URL from another website.

Extract the following information:
1. **Certifications**: Find any security certifications they have (SOC 2 Type I, SOC 2 Type II, ISO 27001 etc). For each certification found, determine:
- The type of certification
- Whether it's verified/current, expired, or not certified
- Any issue or expiry dates mentioned
- Link to the compliance/trust page or report if available
- Link to the compliance/trust page or report if available (must be on ${vendorDomain})

2. **Legal & Security Documents**: Find the direct URLs to:
2. **Legal & Security Documents**: Find the direct URLs on ${vendorDomain} to:
- Privacy Policy page (usually at /privacy, /privacy-policy, or linked in the footer)
- Terms of Service page (usually at /terms, /tos, /terms-of-service, or linked in the footer)
- Trust Center or Security page (typically could be at /trust, /security or trust.website.com or security.website.com)
- Trust Center or Security page (typically could be at /trust, /security or trust.${vendorDomain} or security.${vendorDomain})

3. **Recent News**: Find recent news articles (last 12 months) about the company, especially:
- Security incidents or data breaches
Expand All @@ -81,13 +93,26 @@ Extract the following information:

4. **Summary**: Provide an overall assessment of the vendor's security posture.

Focus on their official website (especially trust/security/compliance pages), press releases, and reputable news sources.`;
Focus on their official website ${vendorWebsite} (especially trust/security/compliance pages), press releases, and reputable news sources.`;

// Provide seed URLs covering common legal/security paths so the agent
// stays on the vendor's domain instead of wandering to unrelated sites.
const seedUrls = [
origin,
`${origin}/privacy`,
`${origin}/privacy-policy`,
`${origin}/terms`,
`${origin}/terms-of-service`,
`${origin}/security`,
`${origin}/trust`,
`${origin}/legal`,
`${origin}/compliance`,
];

// Using SDK (no maxCredits override, no explicit polling here)
// Important: avoid crawling huge sites with a wildcard (e.g. workspace.google.com).
const agentResponse = await firecrawlClient.agent({
prompt,
urls: [origin],
urls: seedUrls,
strictConstrainToURLs: false, // allow following links from seed URLs, but seeds anchor it to the right domain
schema: {
type: 'object',
properties: {
Expand Down Expand Up @@ -173,7 +198,10 @@ Focus on their official website (especially trust/security/compliance pages), pr
});

const normalizedLinks = linkPairs
.map((l) => ({ ...l, url: normalizeUrl(l.url) }))
.map((l) => ({
...l,
url: validateVendorUrl(l.url, vendorDomain, l.label),
}))
.filter((l): l is { label: string; url: string } => Boolean(l.url));

const certifications =
Expand All @@ -182,7 +210,7 @@ Focus on their official website (especially trust/security/compliance pages), pr
status: c.status ?? 'unknown',
issuedAt: normalizeIso(c.issued_at ?? null),
expiresAt: normalizeIso(c.expires_at ?? null),
url: normalizeUrl(c.url ?? null),
url: validateVendorUrl(c.url ?? null, vendorDomain, `cert:${c.type}`),
})) ?? [];

const news =
Expand Down
58 changes: 51 additions & 7 deletions apps/api/src/trigger/vendor/vendor-risk-assessment/firecrawl.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { logger } from '@trigger.dev/sdk';
import { firecrawlVendorDataSchema, type FirecrawlVendorData } from './schema';
import { extractVendorDomain, isUrlFromVendorDomain } from './url-validation';

type FirecrawlStartResponse = {
success: boolean;
Expand Down Expand Up @@ -48,6 +49,14 @@ function normalizeUrl(url: string | null | undefined): string | null {
export async function firecrawlExtractVendorData(
website: string,
): Promise<FirecrawlVendorData | null> {
// Extract vendor domain for URL validation
const vendorDomain = extractVendorDomain(website);
if (!vendorDomain) {
logger.warn('Could not extract vendor domain for URL validation', {
website,
});
return null;
}
const apiKey = process.env.FIRECRAWL_API_KEY;
if (!apiKey) {
logger.warn(
Expand Down Expand Up @@ -77,9 +86,11 @@ export async function firecrawlExtractVendorData(

Goal: return the MOST SPECIFIC, DIRECT URL for each document type below. Do not return general category pages.

You may crawl the site (including subdomains) and follow internal links. Trust portals are often linked in the header/footer under: "Trust", "Trust Center", "Security", "Compliance", "Legal", "Governance", "Privacy", "Data Processing", "DPA".
CRITICAL: Only return URLs that belong to the domain "${vendorDomain}" or its subdomains (e.g., trust.${vendorDomain}, security.${vendorDomain}). Do NOT return URLs from any other domain. If you cannot find a page on ${vendorDomain}, return an empty string for that field.

Return ONLY absolute https URLs. If you cannot find a dedicated page that matches the definition, return an empty string.
You may crawl the site (including subdomains of ${vendorDomain}) and follow internal links. Trust portals are often linked in the header/footer under: "Trust", "Trust Center", "Security", "Compliance", "Legal", "Governance", "Privacy", "Data Processing", "DPA".

Return ONLY absolute https URLs on ${vendorDomain}. If you cannot find a dedicated page that matches the definition, return an empty string.

DEFINITIONS (be strict):
1) trust_portal_url:
Expand Down Expand Up @@ -193,13 +204,46 @@ When multiple candidates exist, choose the most direct URL that best matches the
return null;
}

// Normalizes a candidate URL and keeps it only when it belongs to the vendor's
// domain. Off-domain URLs are logged (with the field label) and dropped.
const validateVendorUrl = (
  url: string | null | undefined,
  label: string,
): string | null => {
  const candidate = normalizeUrl(url);

  // Happy path: a usable URL that passes the vendor-domain check.
  if (candidate && isUrlFromVendorDomain(candidate, vendorDomain)) {
    return candidate;
  }

  // Only warn when there actually was a URL that failed the domain check;
  // missing or unusable input is dropped silently.
  if (candidate) {
    logger.warn('Filtered out URL from wrong domain', {
      vendorDomain,
      label,
      url: candidate,
    });
  }

  return null;
};

const normalized = {
...parsed.data,
privacy_policy_url: normalizeUrl(parsed.data.privacy_policy_url),
terms_of_service_url: normalizeUrl(parsed.data.terms_of_service_url),
security_overview_url: normalizeUrl(parsed.data.security_overview_url),
trust_portal_url: normalizeUrl(parsed.data.trust_portal_url),
soc2_report_url: normalizeUrl(parsed.data.soc2_report_url),
privacy_policy_url: validateVendorUrl(
parsed.data.privacy_policy_url,
'privacy_policy',
),
terms_of_service_url: validateVendorUrl(
parsed.data.terms_of_service_url,
'terms_of_service',
),
security_overview_url: validateVendorUrl(
parsed.data.security_overview_url,
'security_overview',
),
trust_portal_url: validateVendorUrl(
parsed.data.trust_portal_url,
'trust_portal',
),
soc2_report_url: validateVendorUrl(
parsed.data.soc2_report_url,
'soc2_report',
),
};

logger.info('Firecrawl extraction completed', {
Expand Down
Loading
Loading