Merge pull request #155 from NotJoeMartinez/embedding_text_splitter

NotJoeMartinez · web-flow · commit 59fe92a6d493 · 2024-07-06T14:46:04.000-05:00
Embedding text splitter
diff --git a/.gitignore b/.gitignore
@@ -173,4 +173,5 @@ UCYO_jab_esuFRV4b17AJtAw
 .ignore
 .ignore/
 tests/test_data/
-.idea
+.idea
+*.sh
diff --git a/README.md b/README.md
@@ -96,22 +96,25 @@ This requires an OpenAI API key set in the environment variable `OPENAI_API_KEY`
 you can pass the key with the `--openai-api-key` flag. 
 
 
-## `get-embedings`
+## `embeddings`
 Fetches OpenAI embeddings for specified channel
 ```bash
 
 # make sure openAI key is set
 # export OPENAI_API_KEY="[yourOpenAIKey]"
 
-yt-fts get-embeddings --channel "3Blue1Brown"
+yt-fts embeddings --channel "3Blue1Brown"
+
+# specify the time interval (in seconds) used to split the text; the default is 10
+yt-fts embeddings --interval 60 --channel "3Blue1Brown" 
 ```
 
 After the embeddings are saved you will see a `(ss)` next to the channel name when you 
 list channels and you will be able to use the `vsearch` command for that channel. 
 
 ## `vsearch` (Semantic Search)
 `vsearch` is for "Vector search". This requires that you enable semantic 
-search for a channel with `get-embeddings`. It has the same options as 
+search for a channel with `embeddings`. It has the same options as 
 `search` but output will be sorted by similarity to the search string and 
 the default return limit is 10. 
 
diff --git a/tests/basic.sh b/tests/basic.sh
diff --git a/tests/view_chromadb.py b/tests/view_chromadb.py
@@ -1,7 +1,7 @@
 import chromadb
 import sys
 from openai import OpenAI
-from yt_fts.embeddings import get_embedding
+from yt_fts.get_embeddings import get_embedding
 from yt_fts.config import get_or_make_chroma_path
 from yt_fts.utils import time_to_secs
 from yt_fts.db_utils import get_channel_name_from_video_id, get_title_from_db
diff --git a/yt_fts/get_embeddings.py b/yt_fts/get_embeddings.py
diff --git a/yt_fts/list.py b/yt_fts/list.py
@@ -110,7 +110,7 @@ def list_channels(channel_id=None):
     console.print("")
 
 
-#  not dry but for some reason importing from embeddings.py causes slow down 
+#  Not DRY, but for some reason importing from get_embeddings.py causes a slowdown
 def check_ss_enabled(channel_id=None):
     from yt_fts.config import get_db_path
 
diff --git a/yt_fts/utils.py b/yt_fts/utils.py
@@ -145,7 +145,7 @@ def enable_ss(channel_id):
     con.close()
 
 
-def split_subtitles(video_id):
+def split_subtitles(video_id, interval=60):
     from datetime import datetime
     from .db_utils import get_subs_by_video_id
 
@@ -172,8 +172,13 @@ def time_to_seconds(time_str):
 
     interval_texts = {}
     for start, start_time_str, text in converted_data:
-        interval = int(start // 10) * 10
-        key = interval_texts.setdefault(interval, {'start_time': start_time_str, 'texts': []})
+        split_interval = int(start // interval) * interval
+
+        key = interval_texts.setdefault(split_interval, {
+            'start_time': start_time_str,
+            'texts': []
+        })
+
         key['texts'].append(text)
 
     result = [(data['start_time'], ' '.join(data['texts']).strip()) for data in interval_texts.values()]
diff --git a/yt_fts/vector_search.py b/yt_fts/vector_search.py
@@ -4,7 +4,7 @@
 from sqlite_utils import Database
 
 from .utils import time_to_secs, bold_query_matches
-from .embeddings import get_embedding
+from .get_embeddings import get_embedding
 from .config import get_chroma_client
 from .db_utils import (
     get_channel_name_from_video_id,
diff --git a/yt_fts/yt_fts.py b/yt_fts/yt_fts.py
@@ -166,7 +166,7 @@ def delete(channel):
     console.print("[bold]Are you sure you want to delete this channel and all its data?[/bold]")
     confirm = input("(Y/n): ")
 
-    if confirm == "y":
+    if confirm.lower() == "y":
         delete_channel(channel_id)
         print(f"Deleted channel {channel_name}: {channel_url}")
     else:
@@ -303,17 +303,23 @@ def vsearch(text, channel, video, limit, export, openai_api_key):
 @cli.command(
     help="""
     Generate embeddings for a channel using OpenAI's embeddings API.
-
     Requires an OpenAI API key to be set as an environment variable OPENAI_API_KEY.
     """
 )
-@click.option("-c", "--channel", default=None, help="The name or id of the channel to generate embeddings for")
-@click.option("--openai-api-key", default=None,
-              help="OpenAI API key. If not provided, the script will attempt to read it from the OPENAI_API_KEY "
-                   "environment variable.")
-def get_embeddings(channel, openai_api_key):
+@click.option("-c", "--channel",
+              default=None,
+              help="The name or id of the channel to generate embeddings for")
+@click.option("--openai-api-key",
+              default=None,
+              help="OpenAI API key. If not provided, the script will attempt to read it from"
+                   " the OPENAI_API_KEY environment variable.")
+@click.option("-i", "--interval",
+              default=10,
+              type=int,
+              help="Interval in seconds to split the transcripts into chunks")
+def embeddings(channel, openai_api_key, interval=10):
     from yt_fts.db_utils import get_vid_ids_by_channel_id
-    from yt_fts.embeddings import add_embeddings_to_chroma
+    from yt_fts.get_embeddings import add_embeddings_to_chroma
     from yt_fts.utils import split_subtitles, check_ss_enabled, enable_ss
     from openai import OpenAI
 
@@ -342,7 +348,7 @@ def get_embeddings(channel, openai_api_key):
 
     channel_subs = []
     for vid_id in channel_video_ids:
-        split_subs = split_subtitles(vid_id[0])
+        split_subs = split_subtitles(vid_id[0], interval=interval)
         if split_subs is None:
             continue
         for sub in split_subs: