Skip to content

Commit b436d00

Browse files
authored
feat: Ability to combine multiple configs into a single eval (promptfoo#285)
1 parent 3f3208d commit b436d00

8 files changed

+185
-33
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ If you're looking to customize your usage, you have a wide set of parameters at
161161
| `-r, --providers <name or path...>` | One of: openai:chat, openai:completion, openai:model-name, localai:chat:model-name, localai:completion:model-name. See [API providers][providers-docs] |
162162
| `-o, --output <path>` | Path to [output file](https://promptfoo.dev/docs/configuration/parameters#output-file) (csv, json, yaml, html) |
163163
| `--tests <path>` | Path to [external test file](https://promptfoo.dev/docs/configuration/expected-outputs/assertions#load-an-external-tests-file) |
164-
| `-c, --config <path>` | Path to [configuration file](https://promptfoo.dev/docs/configuration/guide). `promptfooconfig.js/json/yaml` is automatically loaded if present |
164+
| `-c, --config <paths>` | Path to one or more [configuration files](https://promptfoo.dev/docs/configuration/guide). `promptfooconfig.js/json/yaml` is automatically loaded if present |
165165
| `-j, --max-concurrency <number>` | Maximum number of concurrent API calls |
166166
| `--table-cell-max-length <number>` | Truncate console table cells to this length |
167167
| `--prompt-prefix <path>` | This prefix is prepended to every prompt |

src/main.ts

+9-9
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,13 @@ import {
1616
cleanupOldResults,
1717
maybeReadConfig,
1818
printBorder,
19-
readConfig,
19+
readConfigs,
2020
readFilters,
2121
readLatestResults,
22+
setConfigDirectoryPath,
2223
writeLatestResults,
23-
writeOutput,
2424
writeMultipleOutputs,
25-
setConfigDirectoryPath,
25+
writeOutput,
2626
} from './util';
2727
import { DEFAULT_README, DEFAULT_YAML_CONFIG, DEFAULT_PROMPTS } from './onboarding';
2828
import { disableCache, clearCache } from './cache';
@@ -218,7 +218,7 @@ async function main() {
218218
'One of: openai:chat, openai:completion, openai:<model name>, or path to custom API caller module',
219219
)
220220
.option(
221-
'-c, --config <path>',
221+
'-c, --config <paths...>',
222222
'Path to configuration file. Automatically loads promptfooconfig.js/json/yaml',
223223
)
224224
.option(
@@ -297,13 +297,13 @@ async function main() {
297297

298298
// Config parsing
299299
let fileConfig: Partial<UnifiedConfig> = {};
300-
const configPath = cmdObj.config;
301-
if (configPath) {
302-
fileConfig = await readConfig(configPath);
300+
const configPaths = cmdObj.config;
301+
if (configPaths) {
302+
fileConfig = await readConfigs(configPaths);
303303
}
304304

305305
// Use basepath in cases where path was supplied in the config file
306-
const basePath = configPath ? dirname(configPath) : '';
306+
const basePath = configPaths ? dirname(configPaths[0]) : '';
307307

308308
const defaultTestRaw = fileConfig.defaultTest || defaultConfig.defaultTest;
309309
const config: Partial<UnifiedConfig> = {
@@ -335,7 +335,7 @@ async function main() {
335335
const parsedPrompts = readPrompts(config.prompts, cmdObj.prompts ? undefined : basePath);
336336
const parsedProviders = await loadApiProviders(config.providers, { basePath });
337337
const parsedTests: TestCase[] = await readTests(
338-
config.tests,
338+
config.tests || [],
339339
cmdObj.tests ? undefined : basePath,
340340
);
341341

src/providers.ts

+2-6
Original file line numberDiff line numberDiff line change
@@ -37,15 +37,11 @@ import type {
3737
ProviderFunction,
3838
ProviderId,
3939
ProviderOptionsMap,
40+
TestSuiteConfig,
4041
} from './types';
4142

4243
export async function loadApiProviders(
43-
providerPaths:
44-
| ProviderId
45-
| ProviderId[]
46-
| ProviderOptionsMap[]
47-
| ProviderOptions[]
48-
| ProviderFunction,
44+
providerPaths: TestSuiteConfig['providers'],
4945
options: {
5046
basePath?: string;
5147
env?: EnvOverrides;

src/testCases.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import { globSync } from 'glob';
88

99
import { fetchCsvFromGoogleSheet } from './fetch';
1010

11-
import type { Assertion, CsvRow, TestCase } from './types';
11+
import type { Assertion, CsvRow, TestCase, TestSuiteConfig } from './types';
1212

1313
function parseJson(json: string): any | undefined {
1414
try {
@@ -123,7 +123,7 @@ export async function readTest(
123123
}
124124

125125
export async function readTests(
126-
tests: string | string[] | TestCase[] | undefined,
126+
tests: TestSuiteConfig['tests'],
127127
basePath: string = '',
128128
): Promise<TestCase[]> {
129129
const ret: TestCase[] = [];

src/types.ts

+5-10
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ export interface CommandLineOptions {
1414
// Command line only
1515
vars?: FilePath;
1616
tests?: FilePath;
17-
config?: FilePath;
17+
config?: FilePath[];
1818
verbose?: boolean;
1919
grader?: string;
2020
view?: string;
@@ -297,15 +297,15 @@ export interface TestCasesWithMetadataPrompt {
297297

298298
export interface TestCasesWithMetadata {
299299
id: string;
300-
testCases: FilePath | FilePath[] | TestCase[];
300+
testCases: FilePath | (FilePath | TestCase)[];
301301
recentEvalDate: Date;
302302
recentEvalId: string;
303303
recentEvalFilepath: FilePath;
304304
count: number;
305305
prompts: TestCasesWithMetadataPrompt[];
306306
}
307307

308-
// Each test case is graded pass/fail. A test case represents a unique input to the LLM after substituting `vars` in the prompt.
308+
// Each test case is graded pass/fail with a score. A test case represents a unique input to the LLM after substituting `vars` in the prompt.
309309
export interface TestCase<Vars = Record<string, string | string[] | object>> {
310310
// Optional description of what you're testing
311311
description?: string;
@@ -384,18 +384,13 @@ export interface TestSuiteConfig {
384384
description?: string;
385385

386386
// One or more LLM APIs to use, for example: openai:gpt-3.5-turbo, openai:gpt-4, localai:chat:vicuna
387-
providers:
388-
| ProviderId
389-
| ProviderId[]
390-
| ProviderOptionsMap[]
391-
| ProviderOptions[]
392-
| ProviderFunction;
387+
providers: ProviderId | ProviderFunction | (ProviderId | ProviderOptionsMap | ProviderOptions)[];
393388

394389
// One or more prompt files to load
395390
prompts: FilePath | FilePath[];
396391

397392
// Path to a test file, OR list of LLM prompt variations (aka "test case")
398-
tests: FilePath | FilePath[] | TestCase[];
393+
tests: FilePath | (FilePath | TestCase)[];
399394

400395
// Scenarios, groupings of data and tests to be evaluated
401396
scenarios?: Scenario[];

src/util.ts

+83
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ import type {
2424
TestCasesWithMetadataPrompt,
2525
UnifiedConfig,
2626
} from './types';
27+
import invariant from 'tiny-invariant';
28+
import { readPrompts } from './prompts';
2729

2830
let globalConfigCache: any = null;
2931

@@ -91,6 +93,87 @@ export async function readConfig(configPath: string): Promise<UnifiedConfig> {
9193
}
9294
}
9395

96+
export async function readConfigs(configPaths: string[]): Promise<UnifiedConfig> {
97+
const configs: UnifiedConfig[] = [];
98+
for (const configPath of configPaths) {
99+
const globPaths = globSync(configPath);
100+
for (const globPath of globPaths) {
101+
const config = await readConfig(globPath);
102+
configs.push(config);
103+
}
104+
}
105+
106+
const providers: UnifiedConfig['providers'] = [];
107+
const seenProviders = new Set<string>();
108+
configs.forEach((config) => {
109+
invariant(
110+
typeof config.providers !== 'function',
111+
'Providers cannot be a function for multiple configs',
112+
);
113+
if (typeof config.providers === 'string') {
114+
if (!seenProviders.has(config.providers)) {
115+
providers.push(config.providers);
116+
seenProviders.add(config.providers);
117+
}
118+
} else if (Array.isArray(config.providers)) {
119+
config.providers.forEach((provider) => {
120+
if (!seenProviders.has(JSON.stringify(provider))) {
121+
providers.push(provider);
122+
seenProviders.add(JSON.stringify(provider));
123+
}
124+
});
125+
}
126+
});
127+
128+
const tests: UnifiedConfig['tests'] = [];
129+
configs.forEach((config) => {
130+
if (typeof config.tests === 'string') {
131+
tests.push(config.tests);
132+
} else if (Array.isArray(config.tests)) {
133+
tests.push(...config.tests);
134+
}
135+
});
136+
137+
const prompts: UnifiedConfig['prompts'] = [];
138+
const seenPrompts = new Set<string>();
139+
configs.forEach((config, idx) => {
140+
const ps = readPrompts(config.prompts, path.dirname(configPaths[idx]));
141+
ps.forEach((prompt) => {
142+
if (!seenPrompts.has(prompt.raw)) {
143+
prompts.push(prompt.raw);
144+
seenPrompts.add(prompt.raw);
145+
}
146+
});
147+
});
148+
149+
// Combine all configs into a single UnifiedConfig
150+
const combinedConfig: UnifiedConfig = {
151+
description: configs.map((config) => config.description).join(', '),
152+
providers,
153+
prompts,
154+
tests,
155+
scenarios: configs.flatMap((config) => config.scenarios || []),
156+
defaultTest: configs.reduce((prev: Partial<TestCase> | undefined, curr) => {
157+
return {
158+
...prev,
159+
...curr.defaultTest,
160+
vars: { ...prev?.vars, ...curr.defaultTest?.vars },
161+
assert: [...(prev?.assert || []), ...(curr.defaultTest?.assert || [])],
162+
options: { ...prev?.options, ...curr.defaultTest?.options },
163+
};
164+
}, {}),
165+
nunjucksFilters: configs.reduce((prev, curr) => ({ ...prev, ...curr.nunjucksFilters }), {}),
166+
env: configs.reduce((prev, curr) => ({ ...prev, ...curr.env }), {}),
167+
evaluateOptions: configs.reduce((prev, curr) => ({ ...prev, ...curr.evaluateOptions }), {}),
168+
commandLineOptions: configs.reduce(
169+
(prev, curr) => ({ ...prev, ...curr.commandLineOptions }),
170+
{},
171+
),
172+
};
173+
174+
return combinedConfig;
175+
}
176+
94177
export function writeMultipleOutputs(
95178
outputPaths: string[],
96179
results: EvaluateSummary,

test/testCases.test.ts

-5
Original file line numberDiff line numberDiff line change
@@ -139,11 +139,6 @@ describe('readTests', () => {
139139
jest.resetAllMocks();
140140
});
141141

142-
test('readTests with no input', async () => {
143-
const result = await readTests(undefined);
144-
expect(result).toEqual([]);
145-
});
146-
147142
test('readTests with string input (CSV file path)', async () => {
148143
(fs.readFileSync as jest.Mock).mockReturnValue(
149144
'var1,var2,__expected\nvalue1,value2,value1\nvalue3,value4,fn:value5',

test/util.test.ts

+83
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import {
1212
maybeRecordFirstRun,
1313
resetGlobalConfig,
1414
readFilters,
15+
readConfigs,
1516
} from '../src/util';
1617

1718
import type { EvaluateResult, EvaluateTable } from '../src/types';
@@ -348,4 +349,86 @@ describe('util', () => {
348349

349350
expect(filters.testFilter).toBe(mockFilter);
350351
});
352+
353+
describe('readConfigs', () => {
354+
test('reads from existing configs', async () => {
355+
const config1 = {
356+
description: 'test1',
357+
providers: ['provider1'],
358+
prompts: ['prompt1'],
359+
tests: ['test1'],
360+
scenarios: ['scenario1'],
361+
defaultTest: {
362+
description: 'defaultTest1',
363+
vars: { var1: 'value1' },
364+
assert: [{ type: 'equals', value: 'expected1' }],
365+
},
366+
nunjucksFilters: { filter1: 'filter1' },
367+
env: { envVar1: 'envValue1' },
368+
evaluateOptions: { maxConcurrency: 1 },
369+
commandLineOptions: { verbose: true },
370+
};
371+
const config2 = {
372+
description: 'test2',
373+
providers: ['provider2'],
374+
prompts: ['prompt2'],
375+
tests: ['test2'],
376+
scenarios: ['scenario2'],
377+
defaultTest: {
378+
description: 'defaultTest2',
379+
vars: { var2: 'value2' },
380+
assert: [{ type: 'equals', value: 'expected2' }],
381+
},
382+
nunjucksFilters: { filter2: 'filter2' },
383+
env: { envVar2: 'envValue2' },
384+
evaluateOptions: { maxConcurrency: 2 },
385+
commandLineOptions: { verbose: false },
386+
};
387+
388+
(globSync as jest.Mock).mockImplementation((pathOrGlob) => [pathOrGlob]);
389+
(fs.readFileSync as jest.Mock)
390+
.mockReturnValueOnce(JSON.stringify(config1))
391+
.mockReturnValueOnce(JSON.stringify(config2))
392+
.mockReturnValue('you should not see this');
393+
394+
// Mocks for prompt loading
395+
(fs.readdirSync as jest.Mock).mockReturnValue([]);
396+
(fs.statSync as jest.Mock).mockImplementation(() => {
397+
throw new Error('File does not exist');
398+
});
399+
400+
const result = await readConfigs(['config1.json', 'config2.json']);
401+
402+
expect(fs.readFileSync).toHaveBeenCalledTimes(2);
403+
expect(fs.statSync).toHaveBeenCalledTimes(2);
404+
expect(result).toEqual({
405+
description: 'test1, test2',
406+
providers: ['provider1', 'provider2'],
407+
prompts: ['prompt1', 'prompt2'],
408+
tests: ['test1', 'test2'],
409+
scenarios: ['scenario1', 'scenario2'],
410+
defaultTest: {
411+
description: 'defaultTest2',
412+
options: {},
413+
vars: { var1: 'value1', var2: 'value2' },
414+
assert: [
415+
{ type: 'equals', value: 'expected1' },
416+
{ type: 'equals', value: 'expected2' },
417+
],
418+
},
419+
nunjucksFilters: { filter1: 'filter1', filter2: 'filter2' },
420+
env: { envVar1: 'envValue1', envVar2: 'envValue2' },
421+
evaluateOptions: { maxConcurrency: 2 },
422+
commandLineOptions: { verbose: false },
423+
});
424+
});
425+
426+
test('throws error for unsupported configuration file format', async () => {
427+
(fs.existsSync as jest.Mock).mockReturnValue(true);
428+
429+
await expect(readConfigs(['config1.unsupported'])).rejects.toThrow(
430+
'Unsupported configuration file format: .unsupported',
431+
);
432+
});
433+
});
351434
});

0 commit comments

Comments
 (0)