Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .prettierignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ dist
.vscode
**/.DS_Store
node_modules
deploy/
document/
*.md
*.mdx
Expand Down
1 change: 1 addition & 0 deletions document/content/docs/toc.en.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ description: FastGPT Toc
- [/en/docs/openapi/share](/en/docs/openapi/share)
- [/en/docs/self-host/config/json](/en/docs/self-host/config/json)
- [/en/docs/self-host/config/model/intro](/en/docs/self-host/config/model/intro)
- [/en/docs/self-host/config/model/minimax](/en/docs/self-host/config/model/minimax)
- [/en/docs/self-host/config/model/siliconCloud](/en/docs/self-host/config/model/siliconCloud)
- [/en/docs/self-host/config/object-storage](/en/docs/self-host/config/object-storage)
- [/en/docs/self-host/config/signoz](/en/docs/self-host/config/signoz)
Expand Down
1 change: 1 addition & 0 deletions document/content/docs/toc.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ description: FastGPT 文档目录
- [/docs/openapi/share](/docs/openapi/share)
- [/docs/self-host/config/json](/docs/self-host/config/json)
- [/docs/self-host/config/model/intro](/docs/self-host/config/model/intro)
- [/docs/self-host/config/model/minimax](/docs/self-host/config/model/minimax)
- [/docs/self-host/config/model/siliconCloud](/docs/self-host/config/model/siliconCloud)
- [/docs/self-host/config/object-storage](/docs/self-host/config/object-storage)
- [/docs/self-host/config/signoz](/docs/self-host/config/signoz)
Expand Down
50 changes: 19 additions & 31 deletions packages/service/common/file/read/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,12 @@ import FormData from 'form-data';
import fs from 'fs';
import type { ReadFileResponse } from '../../../worker/readFile/type';
import { axios } from '../../api/axios';
import { batchRun } from '@fastgpt/global/common/system/utils';
import { matchMdImg } from '@fastgpt/global/common/string/markdown';
import { createPdfParseUsage } from '../../../support/wallet/usage/controller';
import { useDoc2xServer } from '../../../thirdProvider/doc2x';
import { useTextinServer } from '../../../thirdProvider/textin';
import { readRawContentFromBuffer } from '../../../worker/function';
import { uploadImage2S3Bucket } from '../../s3/utils';
import { Mimes } from '../../s3/constants';
import { uploadMdImagesToS3 } from '../../s3/utils';
import { getLogger, LogCategories } from '../../logger';

const logger = getLogger(LogCategories.MODULE.DATASET.FILE);
Expand Down Expand Up @@ -198,41 +196,31 @@ export const readFileContentByBuffer = async ({
logger.debug('File parsing completed', { extension, durationMs: Date.now() - start });

// markdown data format
if (imageList && imageList.length > 0) {
if (imageList && imageList.length > 0 && imageKeyOptions) {
logger.debug('Processing parsed document images', {
extension,
imageCount: imageList.length
});

await batchRun(imageList, async (item) => {
const src = await (async () => {
if (!imageKeyOptions) return '';
try {
const { prefix, expiredTime } = imageKeyOptions;
const ext = `.${item.mime.split('/')[1].replace('x-', '')}`;

return await uploadImage2S3Bucket('private', {
base64Img: `data:${item.mime};base64,${item.base64}`,
uploadKey: `${prefix}/${item.uuid}${ext}`,
mimetype: Mimes[ext as keyof typeof Mimes],
filename: `${item.uuid}${ext}`,
expiredTime
});
} catch (error) {
logger.warn('Failed to upload parsed image to S3', {
extension,
imageUuid: item.uuid,
error
});
return `[Image Upload Failed: ${item.uuid}]`;
}
})();
rawText = rawText.replace(item.uuid, src);
// rawText = rawText.replace(item.uuid, jwtSignS3ObjectKey(src, addDays(new Date(), 90)));
if (formatText) {
formatText = formatText.replace(item.uuid, src);
const replacements = await uploadMdImagesToS3({
imageList,
prefix: imageKeyOptions.prefix,
expiredTime: imageKeyOptions.expiredTime,
onError: (item, error) => {
logger.warn('Failed to upload parsed image to S3', {
extension,
imageUuid: item.uuid,
error
});
}
});

for (const [uuid, src] of replacements) {
rawText = rawText.replace(uuid, src);
if (formatText) {
formatText = formatText.replace(uuid, src);
}
}
}

return {
Expand Down
41 changes: 40 additions & 1 deletion packages/service/common/s3/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { isAfter, differenceInSeconds } from 'date-fns';
import { ERROR_ENUM } from '@fastgpt/global/common/error/errorCode';
import type { ClientSession } from 'mongoose';
import { MongoS3TTL } from './schema';
import { S3Buckets } from './constants';
import { Mimes, S3Buckets } from './constants';
import { S3PrivateBucket } from './buckets/private';
import { S3Sources, type UploadImage2S3BucketParams } from './type';
import { S3PublicBucket } from './buckets/public';
Expand All @@ -12,6 +12,8 @@ import path from 'node:path';
import type { ParsedFileContentS3KeyParams } from './sources/dataset/type';
import { EndpointUrl } from '@fastgpt/global/common/file/constants';
import type { HelperBotTypeEnumType } from '@fastgpt/global/core/chat/helperBot/type';
import type { ImageType } from '../../worker/readFile/type';
import { batchRun } from '@fastgpt/global/common/system/utils';

// S3文件名最大长度配置
export const S3_FILENAME_MAX_LENGTH = 50;
Expand Down Expand Up @@ -287,6 +289,43 @@ export function isS3ObjectKey<T extends keyof typeof S3Sources>(
return typeof key === 'string' && key.startsWith(`${S3Sources[source]}/`);
}

/**
 * Upload base64-encoded images (as extracted by matchMdImg) to the private S3
 * bucket and return a uuid → src map so callers can swap the UUID placeholders
 * back into their markdown text.
 * Shared by file parsing (readFileContentByBuffer) and API dataset content (read.ts).
 */
export async function uploadMdImagesToS3({
  imageList,
  prefix,
  expiredTime,
  onError
}: {
  imageList: ImageType[];
  prefix: string;
  expiredTime?: Date;
  onError?: (item: ImageType, error: unknown) => void;
}): Promise<Map<string, string>> {
  const uuid2Src = new Map<string, string>();

  await batchRun(imageList, async (image) => {
    // e.g. "image/x-png" -> ".png"
    const fileExt = `.${image.mime.split('/')[1].replace('x-', '')}`;
    try {
      const uploadedSrc = await uploadImage2S3Bucket('private', {
        base64Img: `data:${image.mime};base64,${image.base64}`,
        uploadKey: `${prefix}/${image.uuid}${fileExt}`,
        mimetype: Mimes[fileExt as keyof typeof Mimes],
        filename: `${image.uuid}${fileExt}`,
        expiredTime
      });
      uuid2Src.set(image.uuid, uploadedSrc);
    } catch (error) {
      // Best-effort batch: surface the failure to the caller, then replace the
      // placeholder with a visible failure marker instead of aborting the run.
      onError?.(image, error);
      uuid2Src.set(image.uuid, `[Image Upload Failed: ${image.uuid}]`);
    }
  });

  return uuid2Src;
}

export function sanitizeS3ObjectKey(key: string) {
// 替换掉圆括号
const replaceParentheses = (key: string) => {
Expand Down
108 changes: 55 additions & 53 deletions packages/service/core/dataset/apiDataset/feishuDataset/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@ import { type ParentIdType } from '@fastgpt/global/common/parentFolder/type';
import { type Method } from 'axios';
import { createProxyAxios, axios } from '../../../../common/api/axios';
import { getLogger, LogCategories } from '../../../../common/logger';
import { feishuDocToMarkdown } from './feishuDocToMarkdown';

type ResponseDataType = {
success: boolean;
message: string;
data: any;
data: unknown;
};

type FeishuFileListResponse = {
Expand All @@ -33,13 +34,17 @@ type FeishuFileListResponse = {
const feishuBaseUrl = process.env.FEISHU_BASE_URL || 'https://open.feishu.cn';
const logger = getLogger(LogCategories.MODULE.DATASET.API_DATASET);

/**
* Feishu/Lark dataset API.
* Uses doc2markdown (feishu2markdown) for doc→markdown with images; the caller
* (read.ts) reuses the KB image pipeline (matchMdImg → uploadImage2S3Bucket).
*/
export const useFeishuDatasetRequest = ({ feishuServer }: { feishuServer: FeishuServer }) => {
const instance = createProxyAxios({
baseURL: feishuBaseUrl,
timeout: 60000
});

// 添加请求拦截器
instance.interceptors.request.use(async (config) => {
if (!config.headers.Authorization) {
const { data } = await axios.post<{ tenant_access_token: string }>(
Expand All @@ -49,60 +54,46 @@ export const useFeishuDatasetRequest = ({ feishuServer }: { feishuServer: Feishu
app_secret: feishuServer.appSecret
}
);

config.headers['Authorization'] = `Bearer ${data.tenant_access_token}`;
config.headers['Content-Type'] = 'application/json; charset=utf-8';
}
return config;
});

/**
 * Validate a Feishu API response envelope and unwrap its `data` payload.
 * Rejects with a server-error message when the envelope itself is missing.
 */
const checkRes = (data: ResponseDataType) => {
if (data === undefined) {
logger.warn('Feishu dataset response data is empty');
// Runtime message intentionally kept in Chinese ("server error").
return Promise.reject('服务器异常');
}
return data.data;
};
const responseError = (err: any) => {
logger.error('Feishu dataset request failed', { error: err });

if (!err) {
return Promise.reject({ message: '未知错误' });
}
if (typeof err === 'string') {
return Promise.reject({ message: err });
}
if (typeof err.message === 'string') {
return Promise.reject({ message: err.message });
}
if (typeof err.data === 'string') {
return Promise.reject({ message: err.data });
}
if (err?.response?.data) {
return Promise.reject(err?.response?.data);
}
// Normalize the many shapes a failed Feishu request can take (string, Error,
// axios-style response wrapper) into a rejected promise with a `message` field
// where possible. Order matters: most specific shapes are checked first.
const responseError = (err: unknown) => {
logger.error('Feishu dataset request failed', { error: err });
// Runtime message intentionally kept in Chinese ("unknown error").
if (!err) return Promise.reject({ message: '未知错误' });
if (typeof err === 'string') return Promise.reject({ message: err });
if (typeof (err as Error).message === 'string')
return Promise.reject({ message: (err as Error).message });
if (typeof (err as { data?: string })?.data === 'string')
return Promise.reject({ message: (err as { data: string }).data });
// Axios errors carry the server payload under response.data; forward it as-is.
if ((err as { response?: { data?: unknown } })?.response?.data)
return Promise.reject((err as { response: { data: unknown } }).response.data);
return Promise.reject(err);
};

const request = <T>(url: string, data: any, method: Method): Promise<T> => {
/* 去空 */
for (const key in data) {
if (data[key] === undefined) {
delete data[key];
}
const request = <T>(url: string, data: Record<string, unknown>, method: Method): Promise<T> => {
const cleaned = { ...data };
for (const key of Object.keys(cleaned)) {
if (cleaned[key] === undefined) delete cleaned[key];
}

return instance
.request({
url,
method,
data: ['POST', 'PUT'].includes(method) ? data : undefined,
params: !['POST', 'PUT'].includes(method) ? data : undefined
data: ['POST', 'PUT'].includes(method) ? cleaned : undefined,
params: !['POST', 'PUT'].includes(method) ? cleaned : undefined
})
.then((res) => checkRes(res.data))
.then((res) => checkRes(res.data as ResponseDataType) as T)
.catch((err) => responseError(err));
};

Expand All @@ -113,25 +104,22 @@ export const useFeishuDatasetRequest = ({ feishuServer }: { feishuServer: Feishu
}): Promise<APIFileItemType[]> => {
const fetchFiles = async (pageToken?: string): Promise<FeishuFileListResponse['files']> => {
const data = await request<FeishuFileListResponse>(
`/open-apis/drive/v1/files`,
'/open-apis/drive/v1/files',
{
folder_token: parentId || feishuServer.folderToken,
page_size: 200,
page_token: pageToken
},
'GET'
);

if (data.has_more) {
const nextFiles = await fetchFiles(data.next_page_token);
return [...data.files, ...nextFiles];
}

return data.files;
};

const allFiles = await fetchFiles();

return allFiles
.filter((file) => ['folder', 'docx'].includes(file.type))
.map((file) => ({
Expand All @@ -146,12 +134,36 @@ export const useFeishuDatasetRequest = ({ feishuServer }: { feishuServer: Feishu
}));
};

/**
* Get document content as markdown with images (base64).
* Uses doc2markdown; on failure falls back to raw_content (text only, no images).
* Caller (read.ts) runs base64 images through the KB pipeline (matchMdImg → S3 upload).
*/
const getFileContent = async ({
apiFileId
}: {
apiFileId: string;
}): Promise<ApiFileReadContentResponse> => {
const [{ content }, { document }] = await Promise.all([
const result = await feishuDocToMarkdown({
baseUrl: feishuBaseUrl,
appId: feishuServer.appId,
appSecret: feishuServer.appSecret!,
docToken: apiFileId
});

if (result) {
return { title: result.title, rawText: result.markdown };
}

return getFileContentRaw({ apiFileId });
};

const getFileContentRaw = async ({
apiFileId
}: {
apiFileId: string;
}): Promise<ApiFileReadContentResponse> => {
const [contentRes, docRes] = await Promise.all([
request<{ content: string }>(
`/open-apis/docx/v1/documents/${apiFileId}/raw_content`,
{},
Expand All @@ -163,28 +175,21 @@ export const useFeishuDatasetRequest = ({ feishuServer }: { feishuServer: Feishu
'GET'
)
]);

return {
title: document?.title,
rawText: content
title: docRes?.document?.title,
rawText: contentRes?.content ?? ''
};
};

const getFilePreviewUrl = async ({ apiFileId }: { apiFileId: string }): Promise<string> => {
const { metas } = await request<{ metas: { url: string }[] }>(
`/open-apis/drive/v1/metas/batch_query`,
'/open-apis/drive/v1/metas/batch_query',
{
request_docs: [
{
doc_token: apiFileId,
doc_type: 'docx'
}
],
request_docs: [{ doc_token: apiFileId, doc_type: 'docx' }],
with_url: true
},
'POST'
);

return metas[0].url;
};

Expand All @@ -198,7 +203,6 @@ export const useFeishuDatasetRequest = ({ feishuServer }: { feishuServer: Feishu
{},
'GET'
);

return {
rawId: apiFileId,
name: document?.title,
Expand All @@ -211,9 +215,7 @@ export const useFeishuDatasetRequest = ({ feishuServer }: { feishuServer: Feishu
};
};

const getFileRawId = (fileId: string) => {
return fileId;
};
const getFileRawId = (fileId: string) => fileId;

return {
getFileContent,
Expand Down
Loading