Skip to content

Commit b436d00

Browse files
authored
feat: Ability to combine multiple configs into a single eval (promptfoo#285)
1 parent 3f3208d commit b436d00

8 files changed

+185
-33
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ If you're looking to customize your usage, you have a wide set of parameters at
161161
| `-r, --providers <name or path...>` | One of: openai:chat, openai:completion, openai:model-name, localai:chat:model-name, localai:completion:model-name. See [API providers][providers-docs] |
162162
| `-o, --output <path>` | Path to [output file](https://promptfoo.dev/docs/configuration/parameters#output-file) (csv, json, yaml, html) |
163163
| `--tests <path>` | Path to [external test file](https://promptfoo.dev/docs/configuration/expected-outputs/assertions#load-an-external-tests-file) |
164-
| `-c, --config <path>` | Path to [configuration file](https://promptfoo.dev/docs/configuration/guide). `promptfooconfig.js/json/yaml` is automatically loaded if present |
164+
| `-c, --config <paths>` | Path to one or more [configuration files](https://promptfoo.dev/docs/configuration/guide). `promptfooconfig.js/json/yaml` is automatically loaded if present |
165165
| `-j, --max-concurrency <number>` | Maximum number of concurrent API calls |
166166
| `--table-cell-max-length <number>` | Truncate console table cells to this length |
167167
| `--prompt-prefix <path>` | This prefix is prepended to every prompt |

src/main.ts

+9-9
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,13 @@ import {
1616
cleanupOldResults,
1717
maybeReadConfig,
1818
printBorder,
19-
readConfig,
19+
readConfigs,
2020
readFilters,
2121
readLatestResults,
22+
setConfigDirectoryPath,
2223
writeLatestResults,
23-
writeOutput,
2424
writeMultipleOutputs,
25-
setConfigDirectoryPath,
25+
writeOutput,
2626
} from './util';
2727
import { DEFAULT_README, DEFAULT_YAML_CONFIG, DEFAULT_PROMPTS } from './onboarding';
2828
import { disableCache, clearCache } from './cache';
@@ -218,7 +218,7 @@ async function main() {
218218
'One of: openai:chat, openai:completion, openai:<model name>, or path to custom API caller module',
219219
)
220220
.option(
221-
'-c, --config <path>',
221+
'-c, --config <paths...>',
222222
'Path to configuration file. Automatically loads promptfooconfig.js/json/yaml',
223223
)
224224
.option(
@@ -297,13 +297,13 @@ async function main() {
297297

298298
// Config parsing
299299
let fileConfig: Partial<UnifiedConfig> = {};
300-
const configPath = cmdObj.config;
301-
if (configPath) {
302-
fileConfig = await readConfig(configPath);
300+
const configPaths = cmdObj.config;
301+
if (configPaths) {
302+
fileConfig = await readConfigs(configPaths);
303303
}
304304

305305
// Use basepath in cases where path was supplied in the config file
306-
const basePath = configPath ? dirname(configPath) : '';
306+
const basePath = configPaths ? dirname(configPaths[0]) : '';
307307

308308
const defaultTestRaw = fileConfig.defaultTest || defaultConfig.defaultTest;
309309
const config: Partial<UnifiedConfig> = {
@@ -335,7 +335,7 @@ async function main() {
335335
const parsedPrompts = readPrompts(config.prompts, cmdObj.prompts ? undefined : basePath);
336336
const parsedProviders = await loadApiProviders(config.providers, { basePath });
337337
const parsedTests: TestCase[] = await readTests(
338-
config.tests,
338+
config.tests || [],
339339
cmdObj.tests ? undefined : basePath,
340340
);
341341

src/providers.ts

+2-6
Original file line numberDiff line numberDiff line change
@@ -37,15 +37,11 @@ import type {
3737
ProviderFunction,
3838
ProviderId,
3939
ProviderOptionsMap,
40+
TestSuiteConfig,
4041
} from './types';
4142

4243
export async function loadApiProviders(
43-
providerPaths:
44-
| ProviderId
45-
| ProviderId[]
46-
| ProviderOptionsMap[]
47-
| ProviderOptions[]
48-
| ProviderFunction,
44+
providerPaths: TestSuiteConfig['providers'],
4945
options: {
5046
basePath?: string;
5147
env?: EnvOverrides;

src/testCases.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import { globSync } from 'glob';
88

99
import { fetchCsvFromGoogleSheet } from './fetch';
1010

11-
import type { Assertion, CsvRow, TestCase } from './types';
11+
import type { Assertion, CsvRow, TestCase, TestSuiteConfig } from './types';
1212

1313
function parseJson(json: string): any | undefined {
1414
try {
@@ -123,7 +123,7 @@ export async function readTest(
123123
}
124124

125125
export async function readTests(
126-
tests: string | string[] | TestCase[] | undefined,
126+
tests: TestSuiteConfig['tests'],
127127
basePath: string = '',
128128
): Promise<TestCase[]> {
129129
const ret: TestCase[] = [];

src/types.ts

+5-10
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ export interface CommandLineOptions {
1414
// Command line only
1515
vars?: FilePath;
1616
tests?: FilePath;
17-
config?: FilePath;
17+
config?: FilePath[];
1818
verbose?: boolean;
1919
grader?: string;
2020
view?: string;
@@ -297,15 +297,15 @@ export interface TestCasesWithMetadataPrompt {
297297

298298
export interface TestCasesWithMetadata {
299299
id: string;
300-
testCases: FilePath | FilePath[] | TestCase[];
300+
testCases: FilePath | (FilePath | TestCase)[];
301301
recentEvalDate: Date;
302302
recentEvalId: string;
303303
recentEvalFilepath: FilePath;
304304
count: number;
305305
prompts: TestCasesWithMetadataPrompt[];
306306
}
307307

308-
// Each test case is graded pass/fail. A test case represents a unique input to the LLM after substituting `vars` in the prompt.
308+
// Each test case is graded pass/fail with a score. A test case represents a unique input to the LLM after substituting `vars` in the prompt.
309309
export interface TestCase<Vars = Record<string, string | string[] | object>> {
310310
// Optional description of what you're testing
311311
description?: string;
@@ -384,18 +384,13 @@ export interface TestSuiteConfig {
384384
description?: string;
385385

386386
// One or more LLM APIs to use, for example: openai:gpt-3.5-turbo, openai:gpt-4, localai:chat:vicuna
387-
providers:
388-
| ProviderId
389-
| ProviderId[]
390-
| ProviderOptionsMap[]
391-
| ProviderOptions[]
392-
| ProviderFunction;
387+
providers: ProviderId | ProviderFunction | (ProviderId | ProviderOptionsMap | ProviderOptions)[];
393388

394389
// One or more prompt files to load
395390
prompts: FilePath | FilePath[];
396391

397392
// Path to a test file, OR list of LLM prompt variations (aka "test case")
398-
tests: FilePath | FilePath[] | TestCase[];
393+
tests: FilePath | (FilePath | TestCase)[];
399394

400395
// Scenarios, groupings of data and tests to be evaluated
401396
scenarios?: Scenario[];

src/util.ts

+83
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ import type {
2424
TestCasesWithMetadataPrompt,
2525
UnifiedConfig,
2626
} from './types';
27+
import invariant from 'tiny-invariant';
28+
import { readPrompts } from './prompts';
2729

2830
let globalConfigCache: any = null;
2931

@@ -91,6 +93,87 @@ export async function readConfig(configPath: string): Promise<UnifiedConfig> {
9193
}
9294
}
9395

96+
export async function readConfigs(configPaths: string[]): Promise<UnifiedConfig> {
97+
const configs: UnifiedConfig[] = [];
98+
for (const configPath of configPaths) {
99+
const globPaths = globSync(configPath);
100+
for (const globPath of globPaths) {
101+
const config = await readConfig(globPath);
102+
configs.push(config);
103+
}
104+
}
105+
106+
const providers: UnifiedConfig['providers'] = [];
107+
const seenProviders = new Set<string>();
108+
configs.forEach((config) => {
109+
invariant(
110+
typeof config.providers !== 'function',
111+
'Providers cannot be a function for multiple configs',
112+
);
113+
if (typeof config.providers === 'string') {
114+
if (!seenProviders.has(config.providers)) {
115+
providers.push(config.providers);
116+
seenProviders.add(config.providers);
117+
}
118+
} else if (Array.isArray(config.providers)) {
119+
config.providers.forEach((provider) => {
120+
if (!seenProviders.has(JSON.stringify(provider))) {
121+
providers.push(provider);
122+
seenProviders.add(JSON.stringify(provider));
123+
}
124+
});
125+
}
126+
});
127+
128+
const tests: UnifiedConfig['tests'] = [];
129+
configs.forEach((config) => {
130+
if (typeof config.tests === 'string') {
131+
tests.push(config.tests);
132+
} else if (Array.isArray(config.tests)) {
133+
tests.push(...config.tests);
134+
}
135+
});
136+
137+
const prompts: UnifiedConfig['prompts'] = [];
138+
const seenPrompts = new Set<string>();
139+
configs.forEach((config, idx) => {
140+
const ps = readPrompts(config.prompts, path.dirname(configPaths[idx]));
141+
ps.forEach((prompt) => {
142+
if (!seenPrompts.has(prompt.raw)) {
143+
prompts.push(prompt.raw);
144+
seenPrompts.add(prompt.raw);
145+
}
146+
});
147+
});
148+
149+
// Combine all configs into a single UnifiedConfig
150+
const combinedConfig: UnifiedConfig = {
151+
description: configs.map((config) => config.description).join(', '),
152+
providers,
153+
prompts,
154+
tests,
155+
scenarios: configs.flatMap((config) => config.scenarios || []),
156+
defaultTest: configs.reduce((prev: Partial<TestCase> | undefined, curr) => {
157+
return {
158+
...prev,
159+
...curr.defaultTest,
160+
vars: { ...prev?.vars, ...curr.defaultTest?.vars },
161+
assert: [...(prev?.assert || []), ...(curr.defaultTest?.assert || [])],
162+
options: { ...prev?.options, ...curr.defaultTest?.options },
163+
};
164+
}, {}),
165+
nunjucksFilters: configs.reduce((prev, curr) => ({ ...prev, ...curr.nunjucksFilters }), {}),
166+
env: configs.reduce((prev, curr) => ({ ...prev, ...curr.env }), {}),
167+
evaluateOptions: configs.reduce((prev, curr) => ({ ...prev, ...curr.evaluateOptions }), {}),
168+
commandLineOptions: configs.reduce(
169+
(prev, curr) => ({ ...prev, ...curr.commandLineOptions }),
170+
{},
171+
),
172+
};
173+
174+
return combinedConfig;
175+
}
176+
94177
export function writeMultipleOutputs(
95178
outputPaths: string[],
96179
results: EvaluateSummary,

test/testCases.test.ts

-5
Original file line numberDiff line numberDiff line change
@@ -139,11 +139,6 @@ describe('readTests', () => {
139139
jest.resetAllMocks();
140140
});
141141

142-
test('readTests with no input', async () => {
143-
const result = await readTests(undefined);
144-
expect(result).toEqual([]);
145-
});
146-
147142
test('readTests with string input (CSV file path)', async () => {
148143
(fs.readFileSync as jest.Mock).mockReturnValue(
149144
'var1,var2,__expected\nvalue1,value2,value1\nvalue3,value4,fn:value5',

test/util.test.ts

+83
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import {
1212
maybeRecordFirstRun,
1313
resetGlobalConfig,
1414
readFilters,
15+
readConfigs,
1516
} from '../src/util';
1617

1718
import type { EvaluateResult, EvaluateTable } from '../src/types';
@@ -348,4 +349,86 @@ describe('util', () => {
348349

349350
expect(filters.testFilter).toBe(mockFilter);
350351
});
352+
353+
describe('readConfigs', () => {
354+
test('reads from existing configs', async () => {
355+
const config1 = {
356+
description: 'test1',
357+
providers: ['provider1'],
358+
prompts: ['prompt1'],
359+
tests: ['test1'],
360+
scenarios: ['scenario1'],
361+
defaultTest: {
362+
description: 'defaultTest1',
363+
vars: { var1: 'value1' },
364+
assert: [{ type: 'equals', value: 'expected1' }],
365+
},
366+
nunjucksFilters: { filter1: 'filter1' },
367+
env: { envVar1: 'envValue1' },
368+
evaluateOptions: { maxConcurrency: 1 },
369+
commandLineOptions: { verbose: true },
370+
};
371+
const config2 = {
372+
description: 'test2',
373+
providers: ['provider2'],
374+
prompts: ['prompt2'],
375+
tests: ['test2'],
376+
scenarios: ['scenario2'],
377+
defaultTest: {
378+
description: 'defaultTest2',
379+
vars: { var2: 'value2' },
380+
assert: [{ type: 'equals', value: 'expected2' }],
381+
},
382+
nunjucksFilters: { filter2: 'filter2' },
383+
env: { envVar2: 'envValue2' },
384+
evaluateOptions: { maxConcurrency: 2 },
385+
commandLineOptions: { verbose: false },
386+
};
387+
388+
(globSync as jest.Mock).mockImplementation((pathOrGlob) => [pathOrGlob]);
389+
(fs.readFileSync as jest.Mock)
390+
.mockReturnValueOnce(JSON.stringify(config1))
391+
.mockReturnValueOnce(JSON.stringify(config2))
392+
.mockReturnValue('you should not see this');
393+
394+
// Mocks for prompt loading
395+
(fs.readdirSync as jest.Mock).mockReturnValue([]);
396+
(fs.statSync as jest.Mock).mockImplementation(() => {
397+
throw new Error('File does not exist');
398+
});
399+
400+
const result = await readConfigs(['config1.json', 'config2.json']);
401+
402+
expect(fs.readFileSync).toHaveBeenCalledTimes(2);
403+
expect(fs.statSync).toHaveBeenCalledTimes(2);
404+
expect(result).toEqual({
405+
description: 'test1, test2',
406+
providers: ['provider1', 'provider2'],
407+
prompts: ['prompt1', 'prompt2'],
408+
tests: ['test1', 'test2'],
409+
scenarios: ['scenario1', 'scenario2'],
410+
defaultTest: {
411+
description: 'defaultTest2',
412+
options: {},
413+
vars: { var1: 'value1', var2: 'value2' },
414+
assert: [
415+
{ type: 'equals', value: 'expected1' },
416+
{ type: 'equals', value: 'expected2' },
417+
],
418+
},
419+
nunjucksFilters: { filter1: 'filter1', filter2: 'filter2' },
420+
env: { envVar1: 'envValue1', envVar2: 'envValue2' },
421+
evaluateOptions: { maxConcurrency: 2 },
422+
commandLineOptions: { verbose: false },
423+
});
424+
});
425+
426+
test('throws error for unsupported configuration file format', async () => {
427+
(fs.existsSync as jest.Mock).mockReturnValue(true);
428+
429+
await expect(readConfigs(['config1.unsupported'])).rejects.toThrow(
430+
'Unsupported configuration file format: .unsupported',
431+
);
432+
});
433+
});
351434
});

0 commit comments

Comments
 (0)