minimal doc for clip batch and some fixes

rom1504 · rom1504 · commit e6ff7febc357 · 2021-07-07T15:21:12.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+.env
diff --git a/README.md b/README.md
@@ -1,2 +1,16 @@
 # clip-retrieval
 Easily computing clip embeddings and building a clip retrieval system with them
+
+## clip batch
+
+First install it by running:
+```
+python3 -m venv .env
+source .env/bin/activate
+pip install -U pip
+pip install clip-by-openai faiss-cpu fire
+```
+
+Then put some images in an `example_folder`, along with text files that have the same names as the images (or pass `--enable_text=False` to skip text), and run:
+* `python clip_batch.py  --dataset_path example_folder --output_folder output_folder`
+
diff --git a/clip_batch.py b/clip_batch.py
@@ -1,4 +1,4 @@
-#!pip install clip-by-openai torch faiss
+#!pip install clip-by-openai faiss-cpu fire
 import torch
 import clip
 from PIL import Image
@@ -29,7 +29,6 @@ def __init__(self,
                  ):
         super().__init__()
         path = Path(folder)
-        self.model = model
         self.enable_text = enable_text
         self.enable_image = enable_image
 
@@ -76,13 +75,14 @@ def __getitem__(self, ind):
             description = descriptions[self.description_index]
             tokenized_text  = self.tokenizer([description[:255]])[0]
 
-        return {"image_tensor": image_tensor, "text_tokens": tokenized_text, "image_path": str(image_file), "text": description}
+        return {"image_tensor": image_tensor, "text_tokens": tokenized_text, "image_filename": str(image_file), "text": description}
     
 
-def main(dataset_path, output_folder, batch_size=256, num_prepro_workers=32, description_index=0, enable_text=True, enable_image=True):
+def main(dataset_path, output_folder, batch_size=256, num_prepro_workers=8, description_index=0, enable_text=True, enable_image=True):
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    model, preprocess = clip.load("ViT-B/32", device=device)
-    os.mkdir(output_folder)
+    model, preprocess = clip.load("ViT-B/32", device=device, jit=False)
+    if not os.path.exists(output_folder):
+        os.mkdir(output_folder)
     data = DataLoader(ImageDataset(preprocess, dataset_path, description_index=description_index, enable_text=enable_text, enable_image=enable_image), \
         batch_size=batch_size, shuffle=False, num_workers=num_prepro_workers, pin_memory=True, prefetch_factor=2)
     if enable_image:
@@ -101,7 +101,7 @@ def main(dataset_path, output_folder, batch_size=256, num_prepro_workers=32, des
             if enable_text:
                 text_features = model.encode_text(item["text_tokens"].cuda())
                 text_embeddings.append(text_features.cpu().numpy())
-                descriptions.extend(item["description"])
+                descriptions.extend(item["text"])
 
     if enable_image:
         img_emb_mat = np.concatenate(image_embeddings)