feat: Multimodal block display + Steamship agent (a16z-infra#49)

eob · web-flow · commit ddba25dce0b2 · 2023-08-14T01:05:47.000-07:00
* Contribute initial multimodal agent support & Steamship agent
diff --git a/.env.local.example b/.env.local.example
@@ -31,4 +31,7 @@ UPSTASH_REDIS_REST_TOKEN=AZ****
 
 # Twilio related environment variables
 TWILIO_ACCOUNT_SID=AC***
-TWILIO_AUTH_TOKEN=*****
+TWILIO_AUTH_TOKEN=*****
+
+# Steamship related environment variables
+STEAMSHIP_API_KEY=****
diff --git a/.gitignore b/.gitignore
@@ -35,4 +35,10 @@ yarn-error.log*
 next-env.d.ts
 
 /.env.prod
-/fly.toml
+/fly.toml
+
+# JetBrains
+.idea
+
+# Yarn Lockfiles (since this project uses NPM)
+yarn.lock
diff --git a/README.md b/README.md
@@ -114,14 +114,24 @@ e. **Upstash API key**
 <img width="866" alt="Screen Shot 2023-07-10 at 11 07 21 PM" src="https://github.com/a16z-infra/companion-app/assets/3489963/f8e6c43f-8810-423e-86b4-9e8aa70598c9">
 
 
-e. **Supabase API key** (optional)
+f. **Supabase API key** (optional)
 If you prefer to use Supabase, you will need to uncomment `VECTOR_DB=supabase` and fill out the Supabase credentials in `.env.local`.
 
 - Create a Supabase instance [here](https://supabase.com/dashboard/projects); then go to Project Settings -> API
 - `SUPABASE_URL` is the URL value under "Project URL"
 - `SUPABASE_PRIVATE_KEY` is the key starts with `ey` under Project API Keys
 - Now, you should enable pgvector on Supabase and create a schema. You can do this easily by clicking on "SQL editor" on the left hand side on Supabase UI and then clicking on "+New Query". Copy paste [this code snippet](https://github.com/a16z-infra/ai-getting-started/blob/main/pgvector.sql) in the SQL editor and click "Run".
 
+g. **Steamship API key**
+
+You can connect a Steamship agent instance as an LLM with personality, voice and image generation capabilities built in. It also includes its own vector storage and tools. To do so:
+
+- Create an account on [Steamship](https://steamship.com/account)
+- Copy the API key from your account settings page
+- Add it as the `STEAMSHIP_API_KEY` variable in `.env.local`
+
+If you'd like to create your own character personality, add a custom voice, or use a different image model, visit the [Steamship Agent Guidebook](https://www.steamship.com/learn/agent-guidebook), create your own instance, and connect it in `companions.json` using the *Rick* example as a guide.
+
 ### 4. Generate embeddings
 
 The `companions/` directory contains the "personalities" of the AIs in .txt files. To generate embeddings and load them into the vector database to draw from during the chat, run the following command:
diff --git a/companions/companions.json b/companions/companions.json
@@ -20,6 +20,15 @@
     "llm": "vicuna13b",
     "phone": "OPTIONAL_COMPANION_PHONE_NUMBER"
   },
+  {
+    "name": "Rick",
+    "title": "I can generate voice and pictures",
+    "imageUrl": "/rick.jpeg",
+    "llm": "steamship",
+    "generateEndpoint": "https://a16z.steamship.run/rick/ai-companion-59f5d9816b627a45856239ae9f83525e/answer",
+    "phone": "OPTIONAL_COMPANION_PHONE_NUMBER",
+    "telegramLink": "https://t.me/rick_a16z_bot"
+  },
   {
     "name": "Sebastian",
     "title": "I'm a travel blogger and a mystery novel writer",
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -38,6 +38,7 @@
     "react-tooltip": "^5.16.1",
     "replicate": "^0.9.3",
     "tailwindcss": "3.3.2",
+    "ts-md5": "^1.3.1",
     "twilio": "^4.12.0",
     "typescript": "5.1.3"
   },
diff --git a/public/rick.jpeg b/public/rick.jpeg
diff --git a/src/app/api/steamship/route.ts b/src/app/api/steamship/route.ts
@@ -0,0 +1,105 @@
+import dotenv from "dotenv";
+import clerk from "@clerk/clerk-sdk-node";
+import { NextResponse } from "next/server";
+import { currentUser } from "@clerk/nextjs";
+import { rateLimit } from "@/app/utils/rateLimit";
+import {Md5} from 'ts-md5'
+import ConfigManager from "@/app/utils/config";
+
+// Load environment variables (e.g. STEAMSHIP_API_KEY, read below) from .env.local into process.env.
+dotenv.config({ path: `.env.local` });
+
+/**
+ * Builds a JSON error response whose body is `{ "Message": <message> }`.
+ *
+ * @param code HTTP status code to set on the response.
+ * @param message Text placed under the `Message` key of the JSON body.
+ * @returns A `NextResponse` with the given status and a JSON content type.
+ */
+function returnError(code: number, message: string) {
+  const payload = JSON.stringify({ Message: message });
+  const init = {
+    status: code,
+    headers: { "Content-Type": "application/json" },
+  };
+  return new NextResponse(payload, init);
+}
+
+/**
+ * Handles a chat request for a companion backed by a Steamship agent.
+ *
+ * Reads `prompt`, `isText` and `userId` from the JSON body and the companion
+ * name from the `name` request header, then forwards the prompt to the
+ * companion's `generateEndpoint` and returns the parsed JSON reply.
+ *
+ * Error responses (all JSON `{ Message }`):
+ * - 404 when no companion config matches the `name` header
+ * - 429 when the rate limiter rejects the request
+ * - 401 when no valid Clerk user can be resolved
+ * - 500 when the API key or endpoint is missing, or the agent call fails
+ */
+export async function POST(req: Request) {
+  const { prompt, isText, userId } = await req.json();
+  const companionName = req.headers.get("name");
+
+  // Load the companion config
+  const configManager = ConfigManager.getInstance();
+  const companionConfig = configManager.getConfig("name", companionName);
+  if (!companionConfig) {
+    return returnError(404, `Hi, we were unable to find the configuration for a companion named ${companionName}.`)
+  }
+
+  // Make sure we're not rate limited
+  const identifier = req.url + "-" + (userId || "anonymous");
+  const { success } = await rateLimit(identifier);
+  if (!success) {
+    console.log("INFO: rate limit exceeded");
+    return returnError(429, `Hi, the companions can't talk this fast.`)
+  }
+
+  if (!process.env.STEAMSHIP_API_KEY) {
+    return returnError(500, `Please set the STEAMSHIP_API_KEY env variable and make sure ${companionName} is connected to an Agent instance that you own.`)
+  }
+
+  console.log(`Companion Name: ${companionName}`)
+  console.log(`Prompt: ${prompt}`);
+
+  // Text-channel requests carry the user id in the body; otherwise fall back
+  // to the Clerk session attached to the request.
+  let clerkUserId: string | undefined;
+  if (isText) {
+    clerkUserId = userId;
+  } else {
+    const user = await currentUser();
+    clerkUserId = user?.id;
+  }
+
+  if (!clerkUserId || !(await clerk.users.getUser(clerkUserId))) {
+    console.log("user not authorized");
+    return returnError(401, "User not authorized");
+  }
+
+  // Create a chat session id for the user. This hashes the *resolved* user id:
+  // the body's `userId` is absent for session-authenticated requests, which
+  // would otherwise put every such user into one shared "anonymous" session.
+  const chatSessionId = Md5.hashStr(clerkUserId);
+
+  // Make sure we have a generate endpoint.
+  // TODO: Create a new instance of the agent per user if this proves advantageous.
+  const agentUrl = companionConfig.generateEndpoint
+  if (!agentUrl) {
+    return returnError(500, `Please add a Steamship 'generateEndpoint' to your ${companionName} configuration in companions.json.`)
+  }
+
+  // Invoke the generation. Tool invocation, chat history management, backstory injection, etc is all done within this endpoint.
+  // To build, deploy, and host your own multi-tenant agent see: https://www.steamship.com/learn/agent-guidebook
+  let response: Response;
+  try {
+    response = await fetch(agentUrl, {
+      method: "POST",
+      headers: {
+        "Content-Type": "application/json",
+        "Authorization": `Bearer ${process.env.STEAMSHIP_API_KEY}`
+      },
+      body: JSON.stringify({
+        question: prompt,
+        chat_session_id: chatSessionId
+      })
+    });
+  } catch (err) {
+    // Network-level failure (DNS, connection reset, ...): answer with a JSON
+    // error instead of letting the handler throw.
+    console.error("ERROR: request to the Steamship agent failed", err);
+    return returnError(500, `Unable to reach the Steamship agent for ${companionName}.`)
+  }
+
+  const responseText = await response.text()
+  if (!response.ok) {
+    return returnError(500, responseText)
+  }
+
+  let responseBlocks: unknown;
+  try {
+    responseBlocks = JSON.parse(responseText)
+  } catch {
+    // A 2xx reply that is not JSON would otherwise crash the handler.
+    return returnError(500, `The Steamship agent for ${companionName} returned a response that was not valid JSON.`)
+  }
+  return NextResponse.json(responseBlocks)
+}
diff --git a/src/components/ChatBlock.tsx b/src/components/ChatBlock.tsx
@@ -0,0 +1,69 @@
+/*
+ * Represents a unit of multimodal chat: text, video, audio, or image.
+ *
+ * For streaming responses, just update the `text` argument.
+ *
+ * Rendering precedence:
+ *   1. `text`, when non-empty, is shown as plain text.
+ *   2. Otherwise `url` is rendered according to `mimeType` (audio, video, image).
+ *   3. A `url` with a missing or unrecognised `mimeType` is shown as a link, so
+ *      the content stays reachable instead of rendering as an empty block.
+ */
+export function ChatBlock({text, mimeType, url} : {
+    text?: string,
+    mimeType?: string,
+    url?: string
+}) {
+    let internalComponent = <></>
+    if (text) {
+        internalComponent = <span>{text}</span>
+    } else if (url) {
+        if (mimeType?.startsWith("audio")) {
+            internalComponent = <audio controls={true} src={url} />
+        } else if (mimeType?.startsWith("video")) {
+            internalComponent = <video controls width="250">
+                <source src={url} type={mimeType} />
+                Download the <a href={url}>video</a>
+            </video>
+        } else if (mimeType?.startsWith("image")) {
+            // `alt` keeps the image announced by assistive technology.
+            internalComponent = <img src={url} alt="Image attachment" />
+        } else {
+            // No MIME type, or one we cannot embed: fall back to a plain link.
+            internalComponent = <a href={url}>Link</a>
+        }
+    }
+
+    return (
+        <p className="text-sm text-gray-200 pb-2">
+            {internalComponent}
+        </p>
+    );
+}
+
+/*
+ * Take a completion, which may be a string, JSON encoded as a string, or JSON object,
+ * and produce a list of ChatBlock elements. This is intended to be a one-size-fits-all
+ * method for funneling different LLM output into structure that supports different media
+ * types and can easily grow to support more metadata (such as speaker).
+ *
+ * - A plain string becomes a single text block.
+ * - A JSON array becomes one block per element (each element is spread as props).
+ * - A JSON object becomes a single block with the object spread as props.
+ *
+ * A string that happens to parse as a JSON number, boolean or null (for example
+ * the reply "42") is kept as text; spreading such a value as props would render
+ * an empty block.
+ */
+export function responseToChatBlocks(completion: any) {
+    let parsed = completion
+    if (typeof completion === "string") {
+        try {
+            const candidate = JSON.parse(completion)
+            // Adopt the parsed value only when it is structured data or a
+            // JSON-encoded string; other primitives stay as the original text.
+            if (typeof candidate === "string" || (candidate !== null && typeof candidate === "object")) {
+                parsed = candidate
+            }
+        } catch {
+            // Not JSON; treat the completion as plain text.
+        }
+    }
+
+    if (typeof parsed === "string") {
+        return [<ChatBlock text={parsed} key={0} />]
+    }
+    if (Array.isArray(parsed)) {
+        // Keys are positional: the list is rebuilt from scratch on every completion update.
+        return parsed.map((block, index) => <ChatBlock {...block} key={index} />)
+    }
+    return [<ChatBlock {...parsed} key={0} />]
+}
+
diff --git a/src/components/Examples.tsx b/src/components/Examples.tsx
@@ -20,6 +20,7 @@ export default function Examples() {
       imageUrl: "",
       llm: "",
       phone: "",
+      telegramLink: null
     },
   ]);
 
@@ -34,6 +35,7 @@ export default function Examples() {
           imageUrl: entry.imageUrl,
           llm: entry.llm,
           phone: entry.phone,
+          telegramLink: entry.telegramLink
         }));
         setExamples(setme);
       } catch (err) {
@@ -80,7 +82,10 @@ export default function Examples() {
               <dl className="mt-1 flex flex-grow flex-col justify-between">
                 <dt className="sr-only"></dt>
                 <dd className="text-sm text-slate-400">
-                  {example.title}. Running on <b>{example.llm}</b>
+                  {example.title}. Running on <b>{example.llm}</b>.
+                  {example.telegramLink && (
+                    <span className="ml-1"><a onClick={(event) => {event?.stopPropagation(); event?.preventDefault}} href={example.telegramLink}>Chat on <b>Telegram</b></a>.</span>
+                  )}
                 </dd>
               </dl>
               <dl className="mt-1 flex flex-grow flex-col justify-between">
diff --git a/src/components/QAModal.tsx b/src/components/QAModal.tsx
@@ -1,8 +1,9 @@
 "use client";
 
-import { Fragment, useEffect } from "react";
+import {Fragment, useEffect, useState} from "react";
 import { Dialog, Transition } from "@headlessui/react";
 import { useCompletion } from "ai/react";
+import {ChatBlock, responseToChatBlocks} from "@/components/ChatBlock";
 
 var last_name = "";
 
@@ -36,6 +37,17 @@ export default function QAModal({
     headers: { name: example.name },
   });
 
+  let [blocks, setBlocks] = useState<ChatBlock[] | null>(null)
+
+  useEffect(() => {
+    // When the completion changes, parse it to multimodal blocks for display.
+    if (completion) {
+      setBlocks(responseToChatBlocks(completion))
+    } else {
+      setBlocks(null)
+    }
+  }, [completion])
+
   if (!example) {
     console.log("ERROR: no companion selected");
     return null;
@@ -82,7 +94,7 @@ export default function QAModal({
                       className={"w-full flex-auto rounded-md border-0 bg-white/5 px-3.5 py-2 shadow-sm focus:outline-none sm:text-sm sm:leading-6 " + (isLoading && !completion ? "text-gray-600 cursor-not-allowed" : "text-white")}                      
                       value={input}
                       onChange={handleInputChange}
-                      disabled={isLoading && !completion}
+                      disabled={isLoading && !blocks}
                     />
                   </form>
                   <div className="mt-3 sm:mt-5">
@@ -91,13 +103,13 @@ export default function QAModal({
                         Chat with {example.name}
                       </p>
                     </div>
-                    {completion && (
+                    {blocks && (
                       <div className="mt-2">
-                        <p className="text-sm text-gray-200">{completion}</p>
+                        {blocks}
                       </div>
                     )}
 
-                    {isLoading && !completion && (
+                    {isLoading && !blocks && (
                       <p className="flex items-center justify-center mt-4">
                         <svg
                           className="animate-spin -ml-1 mr-3 h-5 w-5 text-white"
diff --git a/src/components/actions.ts b/src/components/actions.ts
@@ -9,7 +9,7 @@ import { parse } from "path";
 export async function getCompanions() {
   const COMPFILE = "./companions/companions.json";
   var companions = [];
-  console.log("Loading companion descriptions from "+COMPFILE);
+  // console.log("Loading companion descriptions from "+COMPFILE);
   var fs = require('fs');
   const data = fs.readFileSync(COMPFILE);
   console.log(String(data));