diff --git a/docSite/content/docs/development/docker.md b/docSite/content/docs/development/docker.md index 2388d2d5c30..840701627b1 100644 --- a/docSite/content/docs/development/docker.md +++ b/docSite/content/docs/development/docker.md @@ -17,6 +17,11 @@ weight: 707 | 500w 组向量 | 8c32g | 16c64g 200GB | {{< /table >}} +## 部署架构图 + +![](/imgs/sealos-fastgpt.webp) + + ### 1. 准备好代理环境(国外服务器可忽略) 确保可以访问 OpenAI,具体方案可以参考:[代理方案](/docs/development/proxy/)。或直接在 Sealos 上 [部署 OneAPI](/docs/development/one-api),既解决代理问题也能实现多 Key 轮询、接入其他大模型。 diff --git a/docSite/content/docs/development/qa.md b/docSite/content/docs/development/qa.md index 3784c333750..57f01e60611 100644 --- a/docSite/content/docs/development/qa.md +++ b/docSite/content/docs/development/qa.md @@ -19,6 +19,10 @@ images: [] ## 通用问题 +### 能否纯本地运行 + +可以。需要准备好向量模型和 LLM 模型。 + ### insufficient_user_quota user quota is not enough OneAPI 账号的余额不足,默认 root 用户只有 200 刀,可以手动修改。 @@ -105,7 +109,7 @@ mongo连接失败,检查 ### TypeError: Cannot read properties of null (reading 'useMemo' ) -用 Node18 试试,可能最新的 Node 有问题。 本地开发流程: +删除所有的`node_modules`,用 Node18 重新 install 试试,可能最新的 Node 有问题。 本地开发流程: 1. 根目录: `pnpm i` 2. 复制 `config.json` -> `config.local.json` diff --git a/packages/global/common/file/icon.ts b/packages/global/common/file/icon.ts index 9e1444f641d..d687383ca04 100644 --- a/packages/global/common/file/icon.ts +++ b/packages/global/common/file/icon.ts @@ -3,10 +3,12 @@ export const fileImgs = [ { suffix: 'csv', src: 'file/fill/csv' }, { suffix: '(doc|docs)', src: 'file/fill/doc' }, { suffix: 'txt', src: 'file/fill/txt' }, - { suffix: 'md', src: 'file/fill/markdown' } + { suffix: 'md', src: 'file/fill/markdown' }, + { suffix: 'html', src: 'file/fill/html' } + // { suffix: '.', src: '/imgs/files/file.svg' } ]; -export function getFileIcon(name = '', defaultImg = '/imgs/files/file.svg') { +export function getFileIcon(name = '', defaultImg = 'file/fill/file') { return fileImgs.find((item) => new RegExp(item.suffix, 'gi').test(name))?.src || defaultImg; } diff --git a/packages/global/common/string/markdown.ts b/packages/global/common/string/markdown.ts index 32299f2ddfc..5c2116d4f42 100644 --- a/packages/global/common/string/markdown.ts +++ b/packages/global/common/string/markdown.ts @@ -51,19 +51,18 @@ export const uploadMarkdownBase64 = async ({ // match base64, upload and replace it const base64Regex = /data:image\/.*;base64,([^\)]+)/g; const base64Arr = rawText.match(base64Regex) || []; + // upload base64 and replace it - await Promise.all( - base64Arr.map(async (base64Img) => { - try { - const str = await uploadImgController(base64Img); + for await (const base64Img of base64Arr) { + try { + const str = await uploadImgController(base64Img); - rawText = rawText.replace(base64Img, str); - } catch (error) { - rawText = rawText.replace(base64Img, ''); - rawText = rawText.replace(/!\[.*\]\(\)/g, ''); - } - }) - ); + rawText = rawText.replace(base64Img, str); + } catch (error) { + rawText = rawText.replace(base64Img, ''); + rawText = rawText.replace(/!\[.*\]\(\)/g, ''); + } + } } // Remove white space on both sides of the picture diff --git a/packages/global/core/dataset/api.d.ts b/packages/global/core/dataset/api.d.ts index 43a15f75a62..3c487db2b4a 100644 --- a/packages/global/core/dataset/api.d.ts +++ b/packages/global/core/dataset/api.d.ts @@ -48,10 +48,6 @@ export type FileCreateDatasetCollectionParams = ApiCreateDatasetCollectionParams name: string; rawTextLength: number; hashRawText: string; - trainingType: `${TrainingModeEnum}`; - chunkSize: number; - chunkSplitter:
string; - qaPrompt: string; fileMetadata?: Record; collectionMetadata?: Record; @@ -74,3 +70,14 @@ export type PostWebsiteSyncParams = { datasetId: string; billId: string; }; + +export type PushDatasetDataProps = { + collectionId: string; + data: PushDatasetDataChunkProps[]; + trainingMode: `${TrainingModeEnum}`; + prompt?: string; + billId?: string; +}; +export type PushDatasetDataResponse = { + insertLen: number; +}; diff --git a/packages/global/core/dataset/controller.d.ts b/packages/global/core/dataset/controller.d.ts index 99867174477..732f4cf78f4 100644 --- a/packages/global/core/dataset/controller.d.ts +++ b/packages/global/core/dataset/controller.d.ts @@ -21,7 +21,7 @@ export type UpdateDatasetDataProps = { }; export type PatchIndexesProps = { - type: 'create' | 'update' | 'delete'; + type: 'create' | 'update' | 'delete' | 'unChange'; index: Omit & { dataId?: string; }; diff --git a/packages/service/common/file/image/controller.ts b/packages/service/common/file/image/controller.ts index ecb0796667f..cc98acf8971 100644 --- a/packages/service/common/file/image/controller.ts +++ b/packages/service/common/file/image/controller.ts @@ -46,8 +46,17 @@ export async function readMongoImg({ id }: { id: string }) { return data?.binary; } -export async function delImgByRelatedId(relateIds: string[]) { +export async function delImgByRelatedId({ + teamId, + relateIds +}: { + teamId: string; + relateIds: string[]; +}) { + if (relateIds.length === 0) return; + return MongoImage.deleteMany({ + teamId, 'metadata.relatedId': { $in: relateIds.map((id) => String(id)) } }); } diff --git a/packages/service/common/file/image/schema.ts b/packages/service/common/file/image/schema.ts index baa799ac975..2eb2593d341 100644 --- a/packages/service/common/file/image/schema.ts +++ b/packages/service/common/file/image/schema.ts @@ -34,9 +34,8 @@ const ImageSchema = new Schema({ try { ImageSchema.index({ expiredTime: 1 }, { expireAfterSeconds: 60 }); ImageSchema.index({ type: 1 }); - ImageSchema.index({ teamId: 1 }); ImageSchema.index({ createTime: 1 }); - ImageSchema.index({ 'metadata.relatedId': 1 }); + ImageSchema.index({ teamId: 1, 'metadata.relatedId': 1 }); } catch (error) { console.log(error); } diff --git a/packages/service/common/file/multer.ts b/packages/service/common/file/multer.ts index 90f37892344..27788987c11 100644 --- a/packages/service/common/file/multer.ts +++ b/packages/service/common/file/multer.ts @@ -28,12 +28,16 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => { // }, filename: async (req, file, cb) => { const { ext } = path.parse(decodeURIComponent(file.originalname)); - cb(null, `${getNanoid(32)}${ext}`); + cb(null, `${getNanoid()}${ext}`); } }) }).single('file'); - async doUpload>(req: NextApiRequest, res: NextApiResponse) { + async doUpload>( + req: NextApiRequest, + res: NextApiResponse, + originBuckerName?: `${BucketNameEnum}` + ) { return new Promise<{ file: FileType; metadata: Record; @@ -47,7 +51,7 @@ export const getUploadModel = ({ maxSize = 500 }: { maxSize?: number }) => { } // check bucket name - const bucketName = req.body?.bucketName as `${BucketNameEnum}`; + const bucketName = (req.body?.bucketName || originBuckerName) as `${BucketNameEnum}`; if (bucketName && !bucketNameMap[bucketName]) { return reject('BucketName is invalid'); } diff --git a/packages/service/common/vectorStore/pg/controller.ts b/packages/service/common/vectorStore/pg/controller.ts index f91da1efbc9..ad10b995902 100644 --- a/packages/service/common/vectorStore/pg/controller.ts 
+++ b/packages/service/common/vectorStore/pg/controller.ts @@ -39,14 +39,15 @@ export const insertDatasetDataVector = async ( } ): Promise<{ insertId: string }> => { const { teamId, datasetId, collectionId, vectors, retry = 3 } = props; + try { const { rows } = await PgClient.insert(PgDatasetTableName, { values: [ [ { key: 'vector', value: `[${vectors[0]}]` }, { key: 'team_id', value: String(teamId) }, - { key: 'dataset_id', value: datasetId }, - { key: 'collection_id', value: collectionId } + { key: 'dataset_id', value: String(datasetId) }, + { key: 'collection_id', value: String(collectionId) } ] ] }); @@ -176,8 +177,8 @@ export const getVectorDataByTime = async (start: Date, end: Date) => { `); return rows.map((item) => ({ - id: item.id, - datasetId: item.dataset_id, - teamId: item.team_id + id: String(item.id), + teamId: item.team_id, + datasetId: item.dataset_id })); }; diff --git a/packages/service/core/chat/chatItemSchema.ts b/packages/service/core/chat/chatItemSchema.ts index bf71c8d65d0..5104768ca43 100644 --- a/packages/service/core/chat/chatItemSchema.ts +++ b/packages/service/core/chat/chatItemSchema.ts @@ -89,6 +89,7 @@ try { close custom feedback; */ ChatItemSchema.index({ appId: 1, chatId: 1, dataId: 1 }, { background: true }); + ChatItemSchema.index({ time: -1 }, { background: true }); ChatItemSchema.index({ userGoodFeedback: 1 }, { background: true }); ChatItemSchema.index({ userBadFeedback: 1 }, { background: true }); ChatItemSchema.index({ customFeedbacks: 1 }, { background: true }); diff --git a/packages/service/core/dataset/collection/controller.ts b/packages/service/core/dataset/collection/controller.ts index 4a609088fb2..eede16413d1 100644 --- a/packages/service/core/dataset/collection/controller.ts +++ b/packages/service/core/dataset/collection/controller.ts @@ -25,7 +25,7 @@ export async function createOneCollection({ type, trainingType = TrainingModeEnum.chunk, - chunkSize = 0, + chunkSize = 512, chunkSplitter, qaPrompt, @@ -134,7 +134,10 @@ export async function delCollectionAndRelatedSources({ // delete file and imgs await Promise.all([ - delImgByRelatedId(relatedImageIds), + delImgByRelatedId({ + teamId, + relateIds: relatedImageIds + }), delFileByFileIdList({ bucketName: BucketNameEnum.dataset, fileIdList diff --git a/packages/service/core/dataset/training/controller.ts b/packages/service/core/dataset/training/controller.ts index 298b3d1c1b2..222be48a5d0 100644 --- a/packages/service/core/dataset/training/controller.ts +++ b/packages/service/core/dataset/training/controller.ts @@ -1,5 +1,15 @@ import { delay } from '@fastgpt/global/common/system/utils'; import { MongoDatasetTraining } from './schema'; +import type { + PushDatasetDataChunkProps, + PushDatasetDataProps, + PushDatasetDataResponse +} from '@fastgpt/global/core/dataset/api.d'; +import { getCollectionWithDataset } from '../controller'; +import { TrainingModeEnum } from '@fastgpt/global/core/dataset/constants'; +import { simpleText } from '@fastgpt/global/common/string/tools'; +import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken'; +import type { VectorModelItemType, LLMModelItemType } from '@fastgpt/global/core/ai/model.d'; export const lockTrainingDataByTeamId = async (teamId: string, retry = 3): Promise => { try { @@ -19,3 +29,165 @@ export const lockTrainingDataByTeamId = async (teamId: string, retry = 3): Promi return Promise.reject(error); } }; + +export async function pushDataListToTrainingQueue({ + teamId, + tmbId, + collectionId, + data, + prompt, + billId, + 
trainingMode = TrainingModeEnum.chunk, + + vectorModelList = [], + qaModelList = [] +}: { + teamId: string; + tmbId: string; + vectorModelList: VectorModelItemType[]; + qaModelList: LLMModelItemType[]; +} & PushDatasetDataProps): Promise { + const { + datasetId: { _id: datasetId, vectorModel, agentModel } + } = await getCollectionWithDataset(collectionId); + + const checkModelValid = async ({ collectionId }: { collectionId: string }) => { + if (!collectionId) return Promise.reject(`CollectionId is empty`); + + if (trainingMode === TrainingModeEnum.chunk) { + const vectorModelData = vectorModelList?.find((item) => item.model === vectorModel); + if (!vectorModelData) { + return Promise.reject(`Model ${vectorModel} is inValid`); + } + + return { + maxToken: vectorModelData.maxToken * 1.5, + model: vectorModelData.model, + weight: vectorModelData.weight + }; + } + + if (trainingMode === TrainingModeEnum.qa) { + const qaModelData = qaModelList?.find((item) => item.model === agentModel); + if (!qaModelData) { + return Promise.reject(`Model ${agentModel} is inValid`); + } + return { + maxToken: qaModelData.maxContext * 0.8, + model: qaModelData.model, + weight: 0 + }; + } + return Promise.reject(`Training mode "${trainingMode}" is inValid`); + }; + + const { model, maxToken, weight } = await checkModelValid({ + collectionId + }); + + // format q and a, remove empty char + data.forEach((item) => { + item.q = simpleText(item.q); + item.a = simpleText(item.a); + + item.indexes = item.indexes + ?.map((index) => { + return { + ...index, + text: simpleText(index.text) + }; + }) + .filter(Boolean); + }); + + // filter repeat or equal content + const set = new Set(); + const filterResult: Record = { + success: [], + overToken: [], + repeat: [], + error: [] + }; + + // filter repeat content + data.forEach((item) => { + if (!item.q) { + filterResult.error.push(item); + return; + } + + const text = item.q + item.a; + + // count q token + const token = countPromptTokens(item.q); + + if (token > maxToken) { + filterResult.overToken.push(item); + return; + } + + if (set.has(text)) { + console.log('repeat', item); + filterResult.repeat.push(item); + } else { + filterResult.success.push(item); + set.add(text); + } + }); + + // insert data to db + const insertData = async (dataList: PushDatasetDataChunkProps[], retry = 3): Promise => { + try { + const results = await MongoDatasetTraining.insertMany( + dataList.map((item, i) => ({ + teamId, + tmbId, + datasetId, + collectionId, + billId, + mode: trainingMode, + prompt, + model, + q: item.q, + a: item.a, + chunkIndex: item.chunkIndex ?? i, + weight: weight ?? 
0, + indexes: item.indexes + })) + ); + await delay(500); + return results.length; + } catch (error) { + if (retry > 0) { + await delay(500); + return insertData(dataList, retry - 1); + } + return Promise.reject(error); + } + }; + + let insertLen = 0; + const chunkSize = 50; + const chunkList = filterResult.success.reduce( + (acc, cur) => { + const lastChunk = acc[acc.length - 1]; + if (lastChunk.length < chunkSize) { + lastChunk.push(cur); + } else { + acc.push([cur]); + } + return acc; + }, + [[]] as PushDatasetDataChunkProps[][] + ); + for await (const chunks of chunkList) { + insertLen += await insertData(chunks); + } + + delete filterResult.success; + + return { + insertLen, + ...filterResult + }; +} diff --git a/packages/service/support/wallet/bill/schema.ts b/packages/service/support/wallet/bill/schema.ts index b6795d0bb59..25fed021606 100644 --- a/packages/service/support/wallet/bill/schema.ts +++ b/packages/service/support/wallet/bill/schema.ts @@ -52,7 +52,7 @@ const BillSchema = new Schema({ }); try { - BillSchema.index({ teamId: 1, tmbId: 1, time: -1 }); + BillSchema.index({ teamId: 1, time: -1 }); BillSchema.index({ time: 1 }, { expireAfterSeconds: 180 * 24 * 60 * 60 }); } catch (error) { console.log(error); diff --git a/packages/web/common/file/read/csv.ts b/packages/web/common/file/read/csv.ts new file mode 100644 index 00000000000..783a7309ff8 --- /dev/null +++ b/packages/web/common/file/read/csv.ts @@ -0,0 +1,40 @@ +import Papa from 'papaparse'; +import { readFileRawText } from './rawText'; + +/** + * read csv to json + * @response { + * header: string[], + * data: string[][] + * } + */ +export const readCsvContent = async ({ file }: { file: File }) => { + try { + const { rawText: textArr } = await readFileRawText(file); + const csvArr = Papa.parse(textArr).data as string[][]; + if (csvArr.length === 0) { + throw new Error('csv 解析失败'); + } + + const header = csvArr.shift() as string[]; + + // add title to data + const rawText = csvArr + .map((item) => + item.map((value, index) => { + if (!header[index]) return value; + return `${header[index]}: ${value}`; + }) + ) + .flat() + .join('\n'); + + return { + rawText, + header, + data: csvArr.map((item) => item) + }; + } catch (error) { + return Promise.reject('解析 csv 文件失败'); + } +}; diff --git a/packages/web/common/file/read/index.ts b/packages/web/common/file/read/index.ts index d5e4c6ad7d4..d1434210850 100644 --- a/packages/web/common/file/read/index.ts +++ b/packages/web/common/file/read/index.ts @@ -1,4 +1,5 @@ import { loadFile2Buffer } from '../utils'; +import { readCsvContent } from './csv'; import { readHtmlFile } from './html'; import { readMdFile } from './md'; import { readPdfFile } from './pdf'; @@ -29,6 +30,8 @@ export const readFileRawContent = async ({ file, uploadImgController: uploadBase64Controller }); + case 'csv': + return readCsvContent({ file }); case 'pdf': const pdf = await loadFile2Buffer({ file }); return readPdfFile({ pdf }); diff --git a/packages/web/components/common/Textarea/JsonEditor/index.tsx b/packages/web/components/common/Textarea/JsonEditor/index.tsx index 5f04b05a358..7185e662e02 100644 --- a/packages/web/components/common/Textarea/JsonEditor/index.tsx +++ b/packages/web/components/common/Textarea/JsonEditor/index.tsx @@ -74,7 +74,7 @@ const JSONEditor = ({ defaultValue, value, onChange, resize, ...props }: Props) { style = {} } = props; - const [labelX, labelY] = getBezierPath({ + const [, labelX, labelY] = getBezierPath({ sourceX, sourceY, sourcePosition, diff --git 
a/projects/app/src/global/core/api/datasetRes.d.ts b/projects/app/src/global/core/api/datasetRes.d.ts index ceac0965d79..5f25fb93db9 100644 --- a/projects/app/src/global/core/api/datasetRes.d.ts +++ b/projects/app/src/global/core/api/datasetRes.d.ts @@ -8,6 +8,3 @@ import { DatasetCollectionSchemaType } from '@fastgpt/global/core/dataset/type'; /* ======= collection =========== */ /* ==== data ===== */ -export type PushDataResponse = { - insertLen: number; -}; diff --git a/projects/app/src/global/core/dataset/api.d.ts b/projects/app/src/global/core/dataset/api.d.ts index e942fe63781..5b0c5792496 100644 --- a/projects/app/src/global/core/dataset/api.d.ts +++ b/projects/app/src/global/core/dataset/api.d.ts @@ -27,13 +27,7 @@ export type CreateDatasetParams = { export type InsertOneDatasetDataProps = PushDatasetDataChunkProps & { collectionId: string; }; -export type PushDatasetDataProps = { - collectionId: string; - data: PushDatasetDataChunkProps[]; - trainingMode: `${TrainingModeEnum}`; - prompt?: string; - billId?: string; -}; + export type UpdateDatasetDataProps = { id: string; q?: string; // embedding content diff --git a/projects/app/src/pages/api/core/dataset/collection/create/text.ts b/projects/app/src/pages/api/core/dataset/collection/create/text.ts index 59f8267fd2c..06beda81a8b 100644 --- a/projects/app/src/pages/api/core/dataset/collection/create/text.ts +++ b/projects/app/src/pages/api/core/dataset/collection/create/text.ts @@ -16,11 +16,15 @@ import { checkDatasetLimit } from '@fastgpt/service/support/permission/limit/dat import { predictDataLimitLength } from '@fastgpt/global/core/dataset/utils'; import { pushDataToTrainingQueue } from '@/service/core/dataset/data/controller'; import { hashStr } from '@fastgpt/global/common/string/tools'; +import { createTrainingBill } from '@fastgpt/service/support/wallet/bill/controller'; +import { BillSourceEnum } from '@fastgpt/global/support/wallet/bill/constants'; +import { getQAModel, getVectorModel } from '@/service/core/ai/model'; export default async function handler(req: NextApiRequest, res: NextApiResponse) { try { await connectToDatabase(); const { + name, text, trainingType = TrainingModeEnum.chunk, chunkSize = 512, @@ -29,7 +33,7 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< ...body } = req.body as TextCreateDatasetCollectionParams; - const { teamId, tmbId } = await authDataset({ + const { teamId, tmbId, dataset } = await authDataset({ req, authToken: true, authApiKey: true, @@ -52,21 +56,32 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< insertLen: predictDataLimitLength(trainingType, chunks) }); - // 3. create collection - const collectionId = await createOneCollection({ - ...body, - teamId, - tmbId, - type: DatasetCollectionTypeEnum.virtual, + // 3. create collection and training bill + const [collectionId, { billId }] = await Promise.all([ + createOneCollection({ + ...body, + teamId, + tmbId, + type: DatasetCollectionTypeEnum.virtual, - trainingType, - chunkSize, - chunkSplitter, - qaPrompt, + name, + trainingType, + chunkSize, + chunkSplitter, + qaPrompt, - hashRawText: hashStr(text), - rawTextLength: text.length - }); + hashRawText: hashStr(text), + rawTextLength: text.length + }), + createTrainingBill({ + teamId, + tmbId, + appName: name, + billSource: BillSourceEnum.training, + vectorModel: getVectorModel(dataset.vectorModel)?.name, + agentModel: getQAModel(dataset.agentModel)?.name + }) + ]); // 4. 
push chunks to training queue const insertResults = await pushDataToTrainingQueue({ @@ -74,6 +89,8 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< tmbId, collectionId, trainingMode: trainingType, + prompt: qaPrompt, + billId, data: chunks.map((text, index) => ({ q: text, chunkIndex: index @@ -90,3 +107,11 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse< }); } } + +export const config = { + api: { + bodyParser: { + sizeLimit: '10mb' + } + } +}; diff --git a/projects/app/src/pages/api/core/dataset/data/pushData.ts b/projects/app/src/pages/api/core/dataset/data/pushData.ts index 146a21e52b4..d850df695c1 100644 --- a/projects/app/src/pages/api/core/dataset/data/pushData.ts +++ b/projects/app/src/pages/api/core/dataset/data/pushData.ts @@ -3,8 +3,10 @@ import type { NextApiRequest, NextApiResponse } from 'next'; import { jsonRes } from '@fastgpt/service/common/response'; import { connectToDatabase } from '@/service/mongo'; import { withNextCors } from '@fastgpt/service/common/middle/cors'; -import type { PushDataResponse } from '@/global/core/api/datasetRes.d'; -import type { PushDatasetDataProps } from '@/global/core/dataset/api.d'; +import type { + PushDatasetDataProps, + PushDatasetDataResponse +} from '@fastgpt/global/core/dataset/api.d'; import { authDatasetCollection } from '@fastgpt/service/support/permission/auth/dataset'; import { checkDatasetLimit } from '@fastgpt/service/support/permission/limit/dataset'; import { predictDataLimitLength } from '@fastgpt/global/core/dataset/utils'; @@ -39,7 +41,7 @@ export default withNextCors(async function handler(req: NextApiRequest, res: Nex insertLen: predictDataLimitLength(collection.trainingType, data) }); - jsonRes(res, { + jsonRes(res, { data: await pushDataToTrainingQueue({ ...req.body, teamId, diff --git a/projects/app/src/pages/api/plusApi/[...path].ts b/projects/app/src/pages/api/plusApi/[...path].ts index 761fc3ba795..3e7b552167a 100644 --- a/projects/app/src/pages/api/plusApi/[...path].ts +++ b/projects/app/src/pages/api/plusApi/[...path].ts @@ -12,16 +12,13 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) const method = (req.method || 'POST') as Method; const { path = [], ...query } = req.query as any; - const url = `/${path?.join('/')}`; + const url = `/${path?.join('/')}?${new URLSearchParams(query).toString()}`; if (!url) { throw new Error('url is empty'); } - const data = { - ...req.body, - ...query - }; + const data = req.body || query; const repose = await request( url, @@ -56,3 +53,12 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) }); } } + +export const config = { + api: { + bodyParser: { + sizeLimit: '10mb' + }, + responseLimit: '10mb' + } +}; diff --git a/projects/app/src/pages/dataset/detail/components/Import/diffSource/FileLocal.tsx b/projects/app/src/pages/dataset/detail/components/Import/diffSource/FileLocal.tsx index 8916deaf1ef..5c388f102b2 100644 --- a/projects/app/src/pages/dataset/detail/components/Import/diffSource/FileLocal.tsx +++ b/projects/app/src/pages/dataset/detail/components/Import/diffSource/FileLocal.tsx @@ -27,7 +27,7 @@ const Upload = dynamic(() => import('../commonProgress/Upload')); const PreviewRawText = dynamic(() => import('../components/PreviewRawText')); type FileItemType = ImportSourceItemType & { file: File }; -const fileType = '.txt, .docx, .pdf, .md, .html'; +const fileType = '.txt, .docx, .csv, .pdf, .md, .html'; const maxSelectFileCount = 1000; const 
FileLocal = ({ activeStep, goToNext }: ImportDataComponentProps) => { diff --git a/projects/app/src/pages/dataset/detail/components/Import/diffSource/TableLocal.tsx b/projects/app/src/pages/dataset/detail/components/Import/diffSource/TableLocal.tsx index dbf84f752a1..5db5d1567a5 100644 --- a/projects/app/src/pages/dataset/detail/components/Import/diffSource/TableLocal.tsx +++ b/projects/app/src/pages/dataset/detail/components/Import/diffSource/TableLocal.tsx @@ -14,7 +14,8 @@ import { useImportStore } from '../Provider'; import { feConfigs } from '@/web/common/system/staticData'; import dynamic from 'next/dynamic'; -import { fileDownload, readCsvContent } from '@/web/common/file/utils'; +import { fileDownload } from '@/web/common/file/utils'; +import { readCsvContent } from '@fastgpt/web/common/file/read/csv'; const PreviewData = dynamic(() => import('../commonProgress/PreviewData')); const Upload = dynamic(() => import('../commonProgress/Upload')); @@ -56,7 +57,7 @@ const SelectFile = React.memo(function SelectFile({ goToNext }: { goToNext: () = { for await (const selectFile of files) { const { file, folderPath } = selectFile; - const { header, data } = await readCsvContent(file); + const { header, data } = await readCsvContent({ file }); const filterData: FileItemType['chunks'] = data .filter((item) => item[0]) diff --git a/projects/app/src/pages/dataset/detail/components/InputDataModal.tsx b/projects/app/src/pages/dataset/detail/components/InputDataModal.tsx index b793034825f..27ffdd77258 100644 --- a/projects/app/src/pages/dataset/detail/components/InputDataModal.tsx +++ b/projects/app/src/pages/dataset/detail/components/InputDataModal.tsx @@ -193,7 +193,10 @@ const InputDataModal = ({ // not exactly same await putDatasetDataById({ id: dataId, - ...e + ...e, + indexes: e.indexes.map((index) => + index.defaultIndex ? 
getDefaultIndex({ q: e.q, a: e.a }) : index + ) }); return { diff --git a/projects/app/src/pages/dataset/detail/components/Test.tsx b/projects/app/src/pages/dataset/detail/components/Test.tsx index 009b28b223a..20a6acf5fe4 100644 --- a/projects/app/src/pages/dataset/detail/components/Test.tsx +++ b/projects/app/src/pages/dataset/detail/components/Test.tsx @@ -35,7 +35,8 @@ import dynamic from 'next/dynamic'; import { useForm } from 'react-hook-form'; import MySelect from '@/components/Select'; import { useSelectFile } from '@/web/common/file/hooks/useSelectFile'; -import { fileDownload, readCsvContent } from '@/web/common/file/utils'; +import { fileDownload } from '@/web/common/file/utils'; +import { readCsvContent } from '@fastgpt/web/common/file/read/csv'; import { delay } from '@fastgpt/global/common/system/utils'; import QuoteItem from '@/components/core/dataset/QuoteItem'; @@ -125,7 +126,7 @@ const Test = ({ datasetId }: { datasetId: string }) => { const { mutate: onFileTest, isLoading: fileTestIsLoading } = useRequest({ mutationFn: async ({ searchParams }: FormType) => { if (!selectFile) return Promise.reject('File is not selected'); - const { data } = await readCsvContent(selectFile); + const { data } = await readCsvContent({ file: selectFile }); const testList = data.slice(0, 100); const results: SearchTestResponse[] = []; diff --git a/projects/app/src/service/common/system/cron.ts b/projects/app/src/service/common/system/cron.ts index 33434c50576..8f837f40d7c 100644 --- a/projects/app/src/service/common/system/cron.ts +++ b/projects/app/src/service/common/system/cron.ts @@ -3,6 +3,11 @@ import { generateQA } from '@/service/events/generateQA'; import { generateVector } from '@/service/events/generateVector'; import { setCron } from '@fastgpt/service/common/system/cron'; +export const startCron = () => { + setUpdateSystemConfigCron(); + setTrainingQueueCron(); +}; + export const setUpdateSystemConfigCron = () => { setCron('*/5 * * * *', () => { initSystemConfig(); @@ -11,7 +16,7 @@ export const setUpdateSystemConfigCron = () => { }; export const setTrainingQueueCron = () => { - setCron('*/3 * * * *', () => { + setCron('*/1 * * * *', () => { generateVector(); generateQA(); }); diff --git a/projects/app/src/service/core/dataset/data/controller.ts b/projects/app/src/service/core/dataset/data/controller.ts index b2094f93a16..db10f878ceb 100644 --- a/projects/app/src/service/core/dataset/data/controller.ts +++ b/projects/app/src/service/core/dataset/data/controller.ts @@ -9,13 +9,11 @@ import { recallFromVectorStore, updateDatasetDataVector } from '@fastgpt/service/common/vectorStore/controller'; -import { Types } from 'mongoose'; import { DatasetDataIndexTypeEnum, DatasetSearchModeEnum, DatasetSearchModeMap, - SearchScoreTypeEnum, - TrainingModeEnum + SearchScoreTypeEnum } from '@fastgpt/global/core/dataset/constants'; import { getDefaultIndex } from '@fastgpt/global/core/dataset/utils'; import { jiebaSplit } from '@/service/common/string/jieba'; @@ -29,172 +27,26 @@ import { } from '@fastgpt/global/core/dataset/type'; import { reRankRecall } from '../../ai/rerank'; import { countPromptTokens } from '@fastgpt/global/common/string/tiktoken'; -import { hashStr, simpleText } from '@fastgpt/global/common/string/tools'; -import type { PushDatasetDataProps } from '@/global/core/dataset/api.d'; -import type { PushDataResponse } from '@/global/core/api/datasetRes'; -import { PushDatasetDataChunkProps } from '@fastgpt/global/core/dataset/api'; -import { MongoDatasetTraining } from 
'@fastgpt/service/core/dataset/training/schema'; -import { startQueue } from '@/service/utils/tools'; -import { getCollectionWithDataset } from '@fastgpt/service/core/dataset/controller'; -import { getQAModel, getVectorModel } from '../../ai/model'; -import { delay } from '@fastgpt/global/common/system/utils'; - -export async function pushDataToTrainingQueue({ - teamId, - tmbId, - collectionId, - data, - prompt, - billId, - trainingMode -}: { - teamId: string; - tmbId: string; -} & PushDatasetDataProps): Promise { - const checkModelValid = async ({ collectionId }: { collectionId: string }) => { - const { - datasetId: { _id: datasetId, vectorModel, agentModel } - } = await getCollectionWithDataset(collectionId); - - if (trainingMode === TrainingModeEnum.chunk) { - if (!collectionId) return Promise.reject(`CollectionId is empty`); - const vectorModelData = getVectorModel(vectorModel); - if (!vectorModelData) { - return Promise.reject(`Model ${vectorModel} is inValid`); - } - - return { - datasetId, - maxToken: vectorModelData.maxToken * 1.5, - model: vectorModelData.model, - weight: vectorModelData.weight - }; - } - - if (trainingMode === TrainingModeEnum.qa) { - const qaModelData = getQAModel(agentModel); - if (!qaModelData) { - return Promise.reject(`Model ${agentModel} is inValid`); - } - return { - datasetId, - maxToken: qaModelData.maxContext * 0.8, - model: qaModelData.model, - weight: 0 - }; - } - return Promise.reject(`Mode ${trainingMode} is inValid`); - }; - - const { datasetId, model, maxToken, weight } = await checkModelValid({ - collectionId - }); - - // format q and a, remove empty char - data.forEach((item) => { - item.q = simpleText(item.q); - item.a = simpleText(item.a); - - item.indexes = item.indexes - ?.map((index) => { - return { - ...index, - text: simpleText(index.text) - }; - }) - .filter(Boolean); - }); - - // filter repeat or equal content - const set = new Set(); - const filterResult: Record = { - success: [], - overToken: [], - repeat: [], - error: [] - }; - - data.forEach((item) => { - if (!item.q) { - filterResult.error.push(item); - return; - } - - const text = item.q + item.a; - - // count q token - const token = countPromptTokens(item.q); - - if (token > maxToken) { - filterResult.overToken.push(item); - return; - } - - if (set.has(text)) { - console.log('repeat', item); - filterResult.repeat.push(item); - } else { - filterResult.success.push(item); - set.add(text); - } +import { hashStr } from '@fastgpt/global/common/string/tools'; +import type { + PushDatasetDataProps, + PushDatasetDataResponse +} from '@fastgpt/global/core/dataset/api.d'; +import { pushDataListToTrainingQueue } from '@fastgpt/service/core/dataset/training/controller'; + +export async function pushDataToTrainingQueue( + props: { + teamId: string; + tmbId: string; + } & PushDatasetDataProps +): Promise { + const result = await pushDataListToTrainingQueue({ + ...props, + vectorModelList: global.vectorModels, + qaModelList: global.qaModels }); - // 插入记录 - const insertData = async (dataList: PushDatasetDataChunkProps[], retry = 3): Promise => { - try { - const results = await MongoDatasetTraining.insertMany( - dataList.map((item, i) => ({ - teamId, - tmbId, - datasetId, - collectionId, - billId, - mode: trainingMode, - prompt, - model, - q: item.q, - a: item.a, - chunkIndex: item.chunkIndex ?? i, - weight: weight ?? 
0, - indexes: item.indexes - })) - ); - await delay(500); - return results.length; - } catch (error) { - if (retry > 0) { - await delay(1000); - return insertData(dataList, retry - 1); - } - return Promise.reject(error); - } - }; - - let insertLen = 0; - const chunkSize = 50; - const chunkList = filterResult.success.reduce( - (acc, cur) => { - const lastChunk = acc[acc.length - 1]; - if (lastChunk.length < chunkSize) { - lastChunk.push(cur); - } else { - acc.push([cur]); - } - return acc; - }, - [[]] as PushDatasetDataChunkProps[][] - ); - for await (const chunks of chunkList) { - insertLen += await insertData(chunks); - } - - startQueue(); - delete filterResult.success; - - return { - insertLen, - ...filterResult - }; + return result; } /* insert data. @@ -341,6 +193,11 @@ export async function updateData2Dataset({ text: qaStr } }); + } else { + patchResult.push({ + type: 'unChange', + index: item + }); } } else { // not in database, create @@ -379,6 +236,7 @@ export async function updateData2Dataset({ model }); item.index.dataId = result.insertId; + return result; } if (item.type === 'delete' && item.index.dataId) { @@ -397,13 +255,14 @@ export async function updateData2Dataset({ ); const charsLength = result.reduce((acc, cur) => acc + cur.charsLength, 0); + const newIndexes = patchResult.filter((item) => item.type !== 'delete').map((item) => item.index); // update mongo other data mongoData.q = q || mongoData.q; mongoData.a = a ?? mongoData.a; mongoData.fullTextToken = jiebaSplit({ text: mongoData.q + mongoData.a }); // @ts-ignore - mongoData.indexes = indexes; + mongoData.indexes = newIndexes; await mongoData.save(); return { diff --git a/projects/app/src/service/mongo.ts b/projects/app/src/service/mongo.ts index 96c06387e5d..134570f6c55 100644 --- a/projects/app/src/service/mongo.ts +++ b/projects/app/src/service/mongo.ts @@ -7,7 +7,7 @@ import { createDefaultTeam } from '@fastgpt/service/support/user/team/controller import { exit } from 'process'; import { initVectorStore } from '@fastgpt/service/common/vectorStore/controller'; import { getInitConfig } from '@/pages/api/common/system/getInitData'; -import { setUpdateSystemConfigCron, setTrainingQueueCron } from './common/system/cron'; +import { startCron } from './common/system/cron'; /** * connect MongoDB and init data @@ -23,8 +23,7 @@ export function connectToDatabase(): Promise { getInitConfig(); // cron - setUpdateSystemConfigCron(); - setTrainingQueueCron(); + startCron(); initRootUser(); } diff --git a/projects/app/src/web/common/file/controller.ts b/projects/app/src/web/common/file/controller.ts index 3978e78c31a..9c371faf176 100644 --- a/projects/app/src/web/common/file/controller.ts +++ b/projects/app/src/web/common/file/controller.ts @@ -32,13 +32,24 @@ export const uploadFiles = ({ }); }; -export const getUploadBase64ImgController = (props: CompressImgProps & UploadImgProps) => - compressBase64ImgAndUpload({ - maxW: 4000, - maxH: 4000, - maxSize: 1024 * 1024 * 5, - ...props - }); +export const getUploadBase64ImgController = ( + props: CompressImgProps & UploadImgProps, + retry = 3 +): Promise => { + try { + return compressBase64ImgAndUpload({ + maxW: 4000, + maxH: 4000, + maxSize: 1024 * 1024 * 5, + ...props + }); + } catch (error) { + if (retry > 0) { + return getUploadBase64ImgController(props, retry - 1); + } + return Promise.reject(error); + } +}; /** * compress image. 
response base64 diff --git a/projects/app/src/web/common/file/utils.ts b/projects/app/src/web/common/file/utils.ts index 23dcafc8fc1..160ad29ea3a 100644 --- a/projects/app/src/web/common/file/utils.ts +++ b/projects/app/src/web/common/file/utils.ts @@ -1,29 +1,3 @@ -import Papa from 'papaparse'; -import { readFileRawText } from '@fastgpt/web/common/file/read/rawText'; - -/** - * read csv to json - * @response { - * header: string[], - * data: string[][] - * } - */ -export const readCsvContent = async (file: File) => { - try { - const { rawText: textArr } = await readFileRawText(file); - const csvArr = Papa.parse(textArr).data as string[][]; - if (csvArr.length === 0) { - throw new Error('csv 解析失败'); - } - return { - header: csvArr.shift() as string[], - data: csvArr.map((item) => item) - }; - } catch (error) { - return Promise.reject('解析 csv 文件失败'); - } -}; - /** * file download by text */ diff --git a/projects/app/src/web/core/dataset/api.ts b/projects/app/src/web/core/dataset/api.ts index 15aa1769031..bb8cf2d2f66 100644 --- a/projects/app/src/web/core/dataset/api.ts +++ b/projects/app/src/web/core/dataset/api.ts @@ -19,12 +19,14 @@ import type { SearchTestResponse } from '@/global/core/dataset/api.d'; import type { - PushDatasetDataProps, UpdateDatasetDataProps, CreateDatasetParams, InsertOneDatasetDataProps } from '@/global/core/dataset/api.d'; -import type { PushDataResponse } from '@/global/core/api/datasetRes.d'; +import type { + PushDatasetDataProps, + PushDatasetDataResponse +} from '@fastgpt/global/core/dataset/api.d'; import type { DatasetCollectionItemType } from '@fastgpt/global/core/dataset/type'; import { DatasetCollectionSyncResultEnum, @@ -97,7 +99,7 @@ export const getDatasetDataItemById = (id: string) => * push data to training queue */ export const postChunks2Dataset = (data: PushDatasetDataProps) => - POST(`/core/dataset/data/pushData`, data); + POST(`/core/dataset/data/pushData`, data); /** * insert one data to dataset (immediately insert)