Commit 1cb00f8

docs: add examples + guidance on Realtime API support

1 parent 8829c32 commit 1cb00f8
5 files changed, +493 -1 lines changed
README.md

+61
@@ -258,6 +258,67 @@ We recommend that you always instantiate a client (e.g., with `client = OpenAI()
- It's harder to mock for testing purposes
- It's not possible to control cleanup of network connections

## Realtime API beta

The Realtime API enables you to build low-latency, multi-modal conversational experiences. It currently supports text and audio as both input and output, as well as [function calling](https://platform.openai.com/docs/guides/function-calling) through a WebSocket connection.

Under the hood, the SDK uses the [`websockets`](https://websockets.readthedocs.io/en/stable/) library to manage connections.

The Realtime API works through a combination of client-sent events and server-sent events. Clients can send events to do things like update session configuration or send text and audio inputs. Server events confirm when audio responses have completed, or when a text response from the model has been received. A full event reference can be found [here](https://platform.openai.com/docs/api-reference/realtime-client-events) and a guide can be found [here](https://platform.openai.com/docs/guides/realtime).

A basic text-based example:
```py
import asyncio
from openai import AsyncOpenAI

async def main():
    client = AsyncOpenAI()

    async with client.beta.realtime.connect(model="gpt-4o-realtime-preview-2024-10-01") as connection:
        await connection.session.update(session={'modalities': ['text']})

        await connection.conversation.item.create(
            item={
                "type": "message",
                "role": "user",
                "content": [{"type": "input_text", "text": "Say hello!"}],
            }
        )
        await connection.response.create()

        async for event in connection:
            if event.type == 'response.text.delta':
                print(event.delta, flush=True, end="")

            elif event.type == 'response.text.done':
                print()

            elif event.type == "response.done":
                break

asyncio.run(main())
```

However, the real magic of the Realtime API is in handling audio inputs and outputs; see this example [TUI script](https://github.com/stainless-sdks/openai-python/blob/robert/realtime-docs-preview/examples/realtime/push_to_talk_app.py) for a fully fledged example.
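At its simplest, streaming audio in means appending base64-encoded PCM16 chunks to the input audio buffer, committing the buffer, and requesting a response. Here is a minimal sketch, assuming `audio_bytes` already holds 24kHz mono PCM16 data; the `input_audio_buffer.*` and `response.create` events mirror the `audio_util.py` example in this commit, while the helper name `send_audio` is made up for illustration:

```py
import base64

async def send_audio(connection, audio_bytes: bytes) -> None:
    # hypothetical helper: stream raw 24kHz mono PCM16 audio to the server
    chunk_size = 32 * 1024
    for i in range(0, len(audio_bytes), chunk_size):
        await connection.send(
            {
                "type": "input_audio_buffer.append",
                "audio": base64.b64encode(audio_bytes[i : i + chunk_size]).decode("utf-8"),
            }
        )
    # signal end of input and ask the model to respond
    await connection.send({"type": "input_audio_buffer.commit"})
    await connection.send({"type": "response.create", "response": {}})
```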

### Realtime error handling

Whenever an error occurs, the Realtime API will send an [`error` event](https://platform.openai.com/docs/guides/realtime/realtime-api-beta#handling-errors) and the connection will stay open and remain usable. This means you need to handle it yourself, as *no errors are raised directly* by the SDK when an `error` event comes in.
```py
client = AsyncOpenAI()

async with client.beta.realtime.connect(model="gpt-4o-realtime-preview-2024-10-01") as connection:
    ...
    async for event in connection:
        if event.type == 'error':
            print(event.error.type)
            print(event.error.code)
            print(event.error.event_id)
            print(event.error.message)
```
## Using types

Nested request parameters are [TypedDicts](https://docs.python.org/3/library/typing.html#typing.TypedDict). Responses are [Pydantic models](https://docs.pydantic.dev) which also provide helper methods for things like:

examples/realtime/audio_util.py

+142
@@ -0,0 +1,142 @@
```py
from __future__ import annotations

import io
import base64
import asyncio
import threading
from typing import Callable, Awaitable

import numpy as np
import pyaudio
import sounddevice as sd
from pydub import AudioSegment

from openai.resources.beta.realtime.realtime import AsyncRealtimeConnection

CHUNK_LENGTH_S = 0.05  # 50ms
SAMPLE_RATE = 24000
FORMAT = pyaudio.paInt16
CHANNELS = 1

# pyright: reportUnknownMemberType=false, reportUnknownVariableType=false, reportUnknownArgumentType=false


def audio_to_pcm16_base64(audio_bytes: bytes) -> bytes:
    # load the audio file from the byte stream
    audio = AudioSegment.from_file(io.BytesIO(audio_bytes))
    print(f"Loaded audio: {audio.frame_rate=} {audio.channels=} {audio.sample_width=} {audio.frame_width=}")
    # resample to 24kHz mono pcm16
    pcm_audio = audio.set_frame_rate(SAMPLE_RATE).set_channels(CHANNELS).set_sample_width(2).raw_data
    return pcm_audio


class AudioPlayerAsync:
    def __init__(self):
        self.queue = []
        self.lock = threading.Lock()
        self.stream = sd.OutputStream(
            callback=self.callback,
            samplerate=SAMPLE_RATE,
            channels=CHANNELS,
            dtype=np.int16,
            blocksize=int(CHUNK_LENGTH_S * SAMPLE_RATE),
        )
        self.playing = False
        self._frame_count = 0

    def callback(self, outdata, frames, time, status):  # noqa
        with self.lock:
            data = np.empty(0, dtype=np.int16)

            # get next item from queue if there is still space in the buffer
            while len(data) < frames and len(self.queue) > 0:
                item = self.queue.pop(0)
                frames_needed = frames - len(data)
                data = np.concatenate((data, item[:frames_needed]))
                if len(item) > frames_needed:
                    # push any leftover samples back onto the front of the queue
                    self.queue.insert(0, item[frames_needed:])

            self._frame_count += len(data)

            # fill the rest of the frames with zeros if there is no more data
            if len(data) < frames:
                data = np.concatenate((data, np.zeros(frames - len(data), dtype=np.int16)))

        outdata[:] = data.reshape(-1, 1)

    def reset_frame_count(self):
        self._frame_count = 0

    def get_frame_count(self):
        return self._frame_count

    def add_data(self, data: bytes):
        with self.lock:
            # bytes is pcm16 single channel audio data, convert to numpy array
            np_data = np.frombuffer(data, dtype=np.int16)
            self.queue.append(np_data)
            if not self.playing:
                self.start()

    def start(self):
        self.playing = True
        self.stream.start()

    def stop(self):
        self.playing = False
        self.stream.stop()
        with self.lock:
            self.queue = []

    def terminate(self):
        self.stream.close()


async def send_audio_worker_sounddevice(
    connection: AsyncRealtimeConnection,
    should_send: Callable[[], bool] | None = None,
    start_send: Callable[[], Awaitable[None]] | None = None,
):
    sent_audio = False

    device_info = sd.query_devices()
    print(device_info)

    # read ~20ms of audio at a time from the microphone
    read_size = int(SAMPLE_RATE * 0.02)

    stream = sd.InputStream(
        channels=CHANNELS,
        samplerate=SAMPLE_RATE,
        dtype="int16",
    )
    stream.start()

    try:
        while True:
            if stream.read_available < read_size:
                await asyncio.sleep(0)
                continue

            data, _ = stream.read(read_size)

            if should_send() if should_send else True:
                if not sent_audio and start_send:
                    await start_send()
                await connection.send(
                    {"type": "input_audio_buffer.append", "audio": base64.b64encode(data).decode("utf-8")}
                )
                sent_audio = True

            elif sent_audio:
                # the caller stopped sending (e.g. push-to-talk released):
                # commit the buffered audio and ask the model to respond
                print("Done, triggering inference")
                await connection.send({"type": "input_audio_buffer.commit"})
                await connection.send({"type": "response.create", "response": {}})
                sent_audio = False

            await asyncio.sleep(0)

    except KeyboardInterrupt:
        pass
    finally:
        stream.stop()
        stream.close()
```
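
As a rough usage sketch (not part of the commit), `AudioPlayerAsync` could be fed from a Realtime connection like this; the `response.audio.delta` event name and base64-encoded delta payload are assumptions here, so check the event reference linked above:

```py
import base64

from audio_util import AudioPlayerAsync

player = AudioPlayerAsync()

async def play_response_audio(connection) -> None:
    # hypothetical consumer: decode each audio delta and queue it for playback
    async for event in connection:
        if event.type == "response.audio.delta":  # assumed event name
            player.add_data(base64.b64decode(event.delta))
        elif event.type == "response.done":
            break
```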
