Skip to content

Commit 367284f

Browse files
danny-avila authored and justinmdickey committed
🔍 feat: Mistral OCR API / Upload Files as Text (danny-avila#6274)
* refactor: move `loadAuthValues` to `~/services/Tools/credentials`
* feat: add createAxiosInstance function to configure axios with proxy support
* WIP: First pass mistral ocr
* refactor: replace getConvoFiles with getToolFiles for improved file retrieval logic
* refactor: improve document formatting in encodeAndFormat function
* refactor: remove unused resendFiles parameter from buildOptions function (this option comes from the agent config)
* fix: update getFiles call to include files with `text` property as well
* refactor: move file handling to `initializeAgentOptions`
* refactor: enhance addImageURLs method to handle OCR text and improve message formatting
* refactor: update message formatting to handle OCR text in various content types
* refactor: remove unused resendFiles property from compactAgentsSchema
* fix: add error handling for Mistral OCR document upload and logging
* refactor: integrate OCR capability into file upload options and configuration
* refactor: skip processing for text source files in delete request, as they are directly tied to database
* feat: add metadata field to ExtendedFile type and update PanelColumns and PanelTable components for localization and metadata handling
* fix: source icon styling
* wip: first pass, frontend file context agent resources
* refactor: add hover card with contextual information for File Context (OCR) in FileContext component
* feat: enhance file processing by integrating file retrieval for OCR resources in agent initialization
* feat: implement OCR config; fix: agent resource deletion for ocr files
* feat: enhance agent initialization by adding OCR capability check in resource priming
* ci: fix `~/config` module mock
* ci: add OCR property expectation in AppService tests
* refactor: simplify OCR config loading by removing environment variable extraction, to be done when OCR is actually performed
* ci: add unit test to ensure environment variable references are not parsed in OCR config
* refactor: disable base64 image inclusion in OCR request
* refactor: enhance OCR configuration handling by validating environment variables and providing defaults
* refactor: use file stream from disk for mistral ocr api
1 parent 9b5c2fa commit 367284f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+1621
-131
lines changed

api/app/clients/BaseClient.js

+7-3
Original file line numberDiff line numberDiff line change
@@ -1121,9 +1121,13 @@ class BaseClient {
11211121
return message;
11221122
}
11231123

1124-
const files = await getFiles({
1125-
file_id: { $in: fileIds },
1126-
});
1124+
const files = await getFiles(
1125+
{
1126+
file_id: { $in: fileIds },
1127+
},
1128+
{},
1129+
{},
1130+
);
11271131

11281132
await this.addImageURLs(message, files, this.visionMode);
11291133

api/app/clients/tools/util/handleTools.js

+1-40
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ const {
2121
} = require('../');
2222
const { primeFiles: primeCodeFiles } = require('~/server/services/Files/Code/process');
2323
const { createFileSearchTool, primeFiles: primeSearchFiles } = require('./fileSearch');
24+
const { loadAuthValues } = require('~/server/services/Tools/credentials');
2425
const { createMCPTool } = require('~/server/services/MCP');
2526
const { loadSpecs } = require('./loadSpecs');
2627
const { logger } = require('~/config');
@@ -90,45 +91,6 @@ const validateTools = async (user, tools = []) => {
9091
}
9192
};
9293

93-
const loadAuthValues = async ({ userId, authFields, throwError = true }) => {
94-
let authValues = {};
95-
96-
/**
97-
* Finds the first non-empty value for the given authentication field, supporting alternate fields.
98-
* @param {string[]} fields Array of strings representing the authentication fields. Supports alternate fields delimited by "||".
99-
* @returns {Promise<{ authField: string, authValue: string} | null>} An object containing the authentication field and value, or null if not found.
100-
*/
101-
const findAuthValue = async (fields) => {
102-
for (const field of fields) {
103-
let value = process.env[field];
104-
if (value) {
105-
return { authField: field, authValue: value };
106-
}
107-
try {
108-
value = await getUserPluginAuthValue(userId, field, throwError);
109-
} catch (err) {
110-
if (field === fields[fields.length - 1] && !value) {
111-
throw err;
112-
}
113-
}
114-
if (value) {
115-
return { authField: field, authValue: value };
116-
}
117-
}
118-
return null;
119-
};
120-
121-
for (let authField of authFields) {
122-
const fields = authField.split('||');
123-
const result = await findAuthValue(fields);
124-
if (result) {
125-
authValues[result.authField] = result.authValue;
126-
}
127-
}
128-
129-
return authValues;
130-
};
131-
13294
/** @typedef {typeof import('@langchain/core/tools').Tool} ToolConstructor */
13395
/** @typedef {import('@langchain/core/tools').Tool} Tool */
13496

@@ -348,7 +310,6 @@ const loadTools = async ({
348310

349311
module.exports = {
350312
loadToolWithAuth,
351-
loadAuthValues,
352313
validateTools,
353314
loadTools,
354315
};

api/app/clients/tools/util/index.js

+1-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
1-
const { validateTools, loadTools, loadAuthValues } = require('./handleTools');
1+
const { validateTools, loadTools } = require('./handleTools');
22
const handleOpenAIErrors = require('./handleOpenAIErrors');
33

44
module.exports = {
55
handleOpenAIErrors,
6-
loadAuthValues,
76
validateTools,
87
loadTools,
98
};

api/config/index.js

+16
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
const axios = require('axios');
12
const { EventSource } = require('eventsource');
23
const { Time, CacheKeys } = require('librechat-data-provider');
34
const logger = require('./winston');
@@ -47,9 +48,24 @@ const sendEvent = (res, event) => {
4748
res.write(`event: message\ndata: ${JSON.stringify(event)}\n\n`);
4849
};
4950

51+
/**
 * Creates an axios instance, configuring its default proxy from the `proxy`
 * environment variable when that variable is set.
 *
 * The proxy URL is parsed with the WHATWG `URL` class; its host, protocol and,
 * when present, its port and credentials are copied onto
 * `instance.defaults.proxy`. When `proxy` is unset or empty the instance is
 * returned with axios' own defaults untouched.
 *
 * @returns {import('axios').AxiosInstance} A new axios instance.
 * @throws {TypeError} If `process.env.proxy` is set but is not a valid URL.
 * @throws {URIError} If the proxy credentials contain malformed percent-escapes.
 */
function createAxiosInstance() {
  const instance = axios.create();

  if (process.env.proxy) {
    const url = new URL(process.env.proxy);

    const proxyConfig = {
      // `URL.hostname` keeps the square brackets around IPv6 literals; the
      // proxy host is expected as a bare address, so strip them.
      host: url.hostname.replace(/^\[|\]$/g, ''),
      protocol: url.protocol.replace(':', ''),
    };

    // `url.port` is an empty string when the URL has no explicit port (or uses
    // the scheme's default), in which case the port is left for axios to pick.
    if (url.port) {
      proxyConfig.port = Number.parseInt(url.port, 10);
    }

    // `URL` exposes credentials percent-encoded; decode them before use.
    if (url.username) {
      proxyConfig.auth = {
        username: decodeURIComponent(url.username),
        password: decodeURIComponent(url.password),
      };
    }

    instance.defaults.proxy = proxyConfig;
  }

  return instance;
}
64+
5065
module.exports = {
5166
logger,
5267
sendEvent,
5368
getMCPManager,
69+
createAxiosInstance,
5470
getFlowStateManager,
5571
};

api/models/Conversation.js

+38-14
Original file line numberDiff line numberDiff line change
@@ -15,19 +15,6 @@ const searchConversation = async (conversationId) => {
1515
throw new Error('Error searching conversation');
1616
}
1717
};
18-
/**
19-
* Searches for a conversation by conversationId and returns associated file ids.
20-
* @param {string} conversationId - The conversation's ID.
21-
* @returns {Promise<string[] | null>}
22-
*/
23-
const getConvoFiles = async (conversationId) => {
24-
try {
25-
return (await Conversation.findOne({ conversationId }, 'files').lean())?.files ?? [];
26-
} catch (error) {
27-
logger.error('[getConvoFiles] Error getting conversation files', error);
28-
throw new Error('Error getting conversation files');
29-
}
30-
};
3118

3219
/**
3320
* Retrieves a single conversation for a given user and conversation ID.
@@ -73,9 +60,46 @@ const deleteNullOrEmptyConversations = async () => {
7360
}
7461
};
7562

63+
/**
 * Retrieves the entries of a conversation's `files` array that either have
 * `embedded === true` or carry a non-null `metadata.fileIdentifier`.
 *
 * The filtering is done in a single aggregation: `$match` selects the
 * conversation and `$project` + `$filter` keep only the matching entries.
 *
 * NOTE(review): the `getConvoFiles` helper this replaces documented `files` as
 * an array of file-id strings. If that is still the stored shape, neither
 * `$$file.embedded` nor `$$file.metadata.fileIdentifier` can match — confirm
 * against the Conversation schema.
 *
 * @param {string} conversationId - The conversation ID
 * @returns {Promise<MongoFile[]>} - The matching file entries, or an empty array
 *   when the conversation is not found or has no `files`.
 * @throws {Error} 'Error fetching embedded files' if the aggregation fails; the
 *   underlying error is logged and attached as `cause`.
 */
const getToolFiles = async (conversationId) => {
  try {
    const [result] = await Conversation.aggregate([
      { $match: { conversationId } },
      {
        $project: {
          files: {
            $filter: {
              input: '$files',
              as: 'file',
              cond: {
                $or: [
                  { $eq: ['$$file.embedded', true] },
                  // `$ifNull` yields the identifier when present, else `false`,
                  // so this branch is truthy only for entries that have one.
                  { $ifNull: ['$$file.metadata.fileIdentifier', false] },
                ],
              },
            },
          },
          _id: 0,
        },
      },
    ]).exec();

    // No matching conversation (empty result) or a null `files` projection.
    return result?.files ?? [];
  } catch (error) {
    logger.error('[getToolFiles] Error fetching embedded files:', error);
    throw new Error('Error fetching embedded files', { cause: error });
  }
};
99+
76100
module.exports = {
77101
Conversation,
78-
getConvoFiles,
102+
getToolFiles,
79103
searchConversation,
80104
deleteNullOrEmptyConversations,
81105
/**

api/models/File.js

+4-2
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,13 @@ const findFileById = async (file_id, options = {}) => {
1717
* Retrieves files matching a given filter, sorted by the most recently updated.
1818
* @param {Object} filter - The filter criteria to apply.
1919
* @param {Object} [_sortOptions] - Optional sort parameters.
20+
* @param {Object|String} [selectFields={ text: 0 }] - Fields to include/exclude in the query results.
21+
* Default excludes the 'text' field.
2022
* @returns {Promise<Array<IMongoFile>>} A promise that resolves to an array of file documents.
2123
*/
22-
const getFiles = async (filter, _sortOptions) => {
24+
const getFiles = async (filter, _sortOptions, selectFields = { text: 0 }) => {
2325
const sortOptions = { updatedAt: -1, ..._sortOptions };
24-
return await File.find(filter).sort(sortOptions).lean();
26+
return await File.find(filter).select(selectFields).sort(sortOptions).lean();
2527
};
2628

2729
/**

api/server/controllers/agents/callbacks.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,8 @@ const {
1010
ChatModelStreamHandler,
1111
} = require('@librechat/agents');
1212
const { processCodeOutput } = require('~/server/services/Files/Code/process');
13+
const { loadAuthValues } = require('~/server/services/Tools/credentials');
1314
const { saveBase64Image } = require('~/server/services/Files/process');
14-
const { loadAuthValues } = require('~/app/clients/tools/util');
1515
const { logger, sendEvent } = require('~/config');
1616

1717
/** @typedef {import('@librechat/agents').Graph} Graph */

api/server/controllers/agents/client.js

+25-2
Original file line numberDiff line numberDiff line change
@@ -223,14 +223,23 @@ class AgentClient extends BaseClient {
223223
};
224224
}
225225

226+
/**
227+
*
228+
* @param {TMessage} message
229+
* @param {Array<MongoFile>} attachments
230+
* @returns {Promise<Array<Partial<MongoFile>>>}
231+
*/
226232
async addImageURLs(message, attachments) {
227-
const { files, image_urls } = await encodeAndFormat(
233+
const { files, text, image_urls } = await encodeAndFormat(
228234
this.options.req,
229235
attachments,
230236
this.options.agent.provider,
231237
VisionModes.agents,
232238
);
233239
message.image_urls = image_urls.length ? image_urls : undefined;
240+
if (text && text.length) {
241+
message.ocr = text;
242+
}
234243
return files;
235244
}
236245

@@ -308,7 +317,21 @@ class AgentClient extends BaseClient {
308317
assistantName: this.options?.modelLabel,
309318
});
310319

311-
const needsTokenCount = this.contextStrategy && !orderedMessages[i].tokenCount;
320+
if (message.ocr && i !== orderedMessages.length - 1) {
321+
if (typeof formattedMessage.content === 'string') {
322+
formattedMessage.content = message.ocr + '\n' + formattedMessage.content;
323+
} else {
324+
const textPart = formattedMessage.content.find((part) => part.type === 'text');
325+
textPart
326+
? (textPart.text = message.ocr + '\n' + textPart.text)
327+
: formattedMessage.content.unshift({ type: 'text', text: message.ocr });
328+
}
329+
} else if (message.ocr && i === orderedMessages.length - 1) {
330+
systemContent = [systemContent, message.ocr].join('\n');
331+
}
332+
333+
const needsTokenCount =
334+
(this.contextStrategy && !orderedMessages[i].tokenCount) || message.ocr;
312335

313336
/* If tokens were never counted, or, is a Vision request and the message has files, count again */
314337
if (needsTokenCount || (this.isVisionModel && (message.image_urls || message.files))) {

api/server/controllers/tools.js

+2-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ const {
1010
const { processFileURL, uploadImageBuffer } = require('~/server/services/Files/process');
1111
const { processCodeOutput } = require('~/server/services/Files/Code/process');
1212
const { createToolCall, getToolCallsByConvo } = require('~/models/ToolCall');
13-
const { loadAuthValues, loadTools } = require('~/app/clients/tools/util');
13+
const { loadAuthValues } = require('~/server/services/Tools/credentials');
14+
const { loadTools } = require('~/app/clients/tools/util');
1415
const { checkAccess } = require('~/server/middleware');
1516
const { getMessage } = require('~/models/Message');
1617
const { logger } = require('~/config');

api/server/middleware/buildEndpointOption.js

+1-10
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ const openAI = require('~/server/services/Endpoints/openAI');
1010
const agents = require('~/server/services/Endpoints/agents');
1111
const custom = require('~/server/services/Endpoints/custom');
1212
const google = require('~/server/services/Endpoints/google');
13-
const { getConvoFiles } = require('~/models/Conversation');
1413
const { handleError } = require('~/server/utils');
1514

1615
const buildFunction = {
@@ -87,16 +86,8 @@ async function buildEndpointOption(req, res, next) {
8786

8887
// TODO: use `getModelsConfig` only when necessary
8988
const modelsConfig = await getModelsConfig(req);
90-
const { resendFiles = true } = req.body.endpointOption;
9189
req.body.endpointOption.modelsConfig = modelsConfig;
92-
if (isAgents && resendFiles && req.body.conversationId) {
93-
const fileIds = await getConvoFiles(req.body.conversationId);
94-
const requestFiles = req.body.files ?? [];
95-
if (requestFiles.length || fileIds.length) {
96-
req.body.endpointOption.attachments = processFiles(requestFiles, fileIds);
97-
}
98-
} else if (req.body.files) {
99-
// hold the promise
90+
if (req.body.files && !isAgents) {
10091
req.body.endpointOption.attachments = processFiles(req.body.files);
10192
}
10293
next();

api/server/routes/files/files.js

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ const {
1616
} = require('~/server/services/Files/process');
1717
const { getStrategyFunctions } = require('~/server/services/Files/strategies');
1818
const { getOpenAIClient } = require('~/server/controllers/assistants/helpers');
19-
const { loadAuthValues } = require('~/app/clients/tools/util');
19+
const { loadAuthValues } = require('~/server/services/Tools/credentials');
2020
const { getAgent } = require('~/models/Agent');
2121
const { getFiles } = require('~/models/File');
2222
const { logger } = require('~/config');

api/server/services/AppService.js

+8-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1-
const { FileSources, EModelEndpoint, getConfigDefaults } = require('librechat-data-provider');
1+
const {
2+
FileSources,
3+
EModelEndpoint,
4+
loadOCRConfig,
5+
getConfigDefaults,
6+
} = require('librechat-data-provider');
27
const { checkVariables, checkHealth, checkConfig, checkAzureVariables } = require('./start/checks');
38
const { azureAssistantsDefaults, assistantsConfigSetup } = require('./start/assistants');
49
const { initializeFirebase } = require('./Files/Firebase/initialize');
@@ -25,6 +30,7 @@ const AppService = async (app) => {
2530
const config = (await loadCustomConfig()) ?? {};
2631
const configDefaults = getConfigDefaults();
2732

33+
const ocr = loadOCRConfig(config.ocr);
2834
const filteredTools = config.filteredTools;
2935
const includedTools = config.includedTools;
3036
const fileStrategy = config.fileStrategy ?? configDefaults.fileStrategy;
@@ -57,6 +63,7 @@ const AppService = async (app) => {
5763
const interfaceConfig = await loadDefaultInterface(config, configDefaults);
5864

5965
const defaultLocals = {
66+
ocr,
6067
paths,
6168
fileStrategy,
6269
socialLogins,

api/server/services/AppService.spec.js

+30
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ describe('AppService', () => {
120120
},
121121
},
122122
paths: expect.anything(),
123+
ocr: expect.anything(),
123124
imageOutputType: expect.any(String),
124125
fileConfig: undefined,
125126
secureImageLinks: undefined,
@@ -588,4 +589,33 @@ describe('AppService updating app.locals and issuing warnings', () => {
588589
);
589590
});
590591
});
592+
593+
it('should not parse environment variable references in OCR config', async () => {
  // Mock custom configuration with env variable references in OCR config
  const mockConfig = {
    ocr: {
      apiKey: '${OCR_API_KEY_CUSTOM_VAR_NAME}',
      baseURL: '${OCR_BASEURL_CUSTOM_VAR_NAME}',
      strategy: 'mistral_ocr',
      mistralModel: 'mistral-medium',
    },
  };

  require('./Config/loadCustomConfig').mockImplementationOnce(() => Promise.resolve(mockConfig));

  // Remember the prior values so the variables set below do not leak into other tests
  const envKeys = ['OCR_API_KEY_CUSTOM_VAR_NAME', 'OCR_BASEURL_CUSTOM_VAR_NAME'];
  const previousEnv = Object.fromEntries(envKeys.map((key) => [key, process.env[key]]));

  try {
    // Set actual environment variables with different values
    process.env.OCR_API_KEY_CUSTOM_VAR_NAME = 'actual-api-key';
    process.env.OCR_BASEURL_CUSTOM_VAR_NAME = 'https://actual-ocr-url.com';

    // Initialize app
    const app = { locals: {} };
    await AppService(app);

    // Verify that the raw string references were preserved and not interpolated
    expect(app.locals.ocr).toBeDefined();
    expect(app.locals.ocr.apiKey).toEqual('${OCR_API_KEY_CUSTOM_VAR_NAME}');
    expect(app.locals.ocr.baseURL).toEqual('${OCR_BASEURL_CUSTOM_VAR_NAME}');
    expect(app.locals.ocr.strategy).toEqual('mistral_ocr');
    expect(app.locals.ocr.mistralModel).toEqual('mistral-medium');
  } finally {
    // Restore the environment even when an assertion above fails
    for (const key of envKeys) {
      if (previousEnv[key] === undefined) {
        delete process.env[key];
      } else {
        process.env[key] = previousEnv[key];
      }
    }
  }
});
591621
});

0 commit comments

Comments
 (0)