Derive compatible compute type in the case of CPU use (#383)

3manifold · web-flow · commit f25ba3232340 · 2025-03-03T20:25:25.000+08:00
* Fix default compute type for CPU

* Update main.py

* Rerun code formatter
diff --git a/models/tta/picoaudio/picoaudio/audioldm/clap/open_clip/model.py b/models/tta/picoaudio/picoaudio/audioldm/clap/open_clip/model.py
@@ -1,4 +1,4 @@
-""" CLAP Model
+"""CLAP Model
 
 Adapted from CLIP: https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
 Adapted to the Audio Task.
diff --git a/models/tta/picoaudio/picoaudio/audioldm/clap/open_clip/openai.py b/models/tta/picoaudio/picoaudio/audioldm/clap/open_clip/openai.py
@@ -1,4 +1,4 @@
-""" OpenAI pretrained model functions
+"""OpenAI pretrained model functions
 
 Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
 """
diff --git a/models/tta/picoaudio/picoaudio/audioldm/clap/open_clip/timm_model.py b/models/tta/picoaudio/picoaudio/audioldm/clap/open_clip/timm_model.py
@@ -1,4 +1,4 @@
-""" timm model adapter
+"""timm model adapter
 
 Wraps timm (https://github.com/rwightman/pytorch-image-models) models for use as a vision tower in CLIP model.
 """
diff --git a/models/tta/picoaudio/picoaudio/audioldm/clap/open_clip/tokenizer.py b/models/tta/picoaudio/picoaudio/audioldm/clap/open_clip/tokenizer.py
@@ -1,4 +1,4 @@
-""" CLIP tokenizer
+"""CLIP tokenizer
 
 Copied from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.
 """
diff --git a/models/tts/debatts/utils/g2p/english.py b/models/tts/debatts/utils/g2p/english.py
@@ -1,4 +1,4 @@
-""" from https://github.com/keithito/tacotron """
+"""from https://github.com/keithito/tacotron"""
 
 import re
 from unidecode import unidecode
diff --git a/models/tts/valle_v2/modeling_llama.py b/models/tts/valle_v2/modeling_llama.py
@@ -23,7 +23,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" PyTorch LLaMA model."""
+"""PyTorch LLaMA model."""
 import math
 from typing import List, Optional, Tuple, Union
 
diff --git a/preprocessors/Emilia/main.py b/preprocessors/Emilia/main.py
@@ -509,6 +509,9 @@ def main_process(audio_path, save_path=None, audio_name=None):
         logger.info("Using CPU")
         device_name = "cpu"
         device = torch.device(device_name)
+        # whisperX (CTranslate2 backend) cannot run float16 efficiently on CPU, so always force int8 here
+        logger.info("Overriding the compute type to int8")
+        args.compute_type = "int8"
 
     check_env(logger)
 
diff --git a/processors/audio_features_extractor.py b/processors/audio_features_extractor.py
@@ -11,7 +11,7 @@
 1. Acoustic features such as Mel Spectrogram, F0, Energy, etc.
 2. Content features such as phonetic posteriorgrams (PPG) and bottleneck features (BNF) from pretrained models
 
-Note: 
+Note:
 All the features extraction are designed to utilize GPU to the maximum extent, which can ease the on-the-fly extraction for large-scale dataset.
 
 """
diff --git a/processors/descriptive_text_features_extractor.py b/processors/descriptive_text_features_extractor.py
@@ -11,7 +11,7 @@
 The common descriptive text features include:
 1. Global semantic guidance features that extracted some pretrained text models like T5. It can be adopted to TTA, TTM, etc.
 
-Note: 
+Note:
 All the features extraction are designed to utilize GPU to the maximum extent, which can ease the on-the-fly extraction for large-scale dataset.
 
 """
diff --git a/processors/text_features_extractor.py b/processors/text_features_extractor.py
@@ -11,7 +11,7 @@
 The common text features include:
 1. phone features that are used for TTS, SVS, etc.
 
-Note: 
+Note:
 All the features extraction are designed to utilize GPU to the maximum extent, which can ease the on-the-fly extraction for large-scale dataset.
 
 """
diff --git a/text/__init__.py b/text/__init__.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-""" This code is modified from https://github.com/keithito/tacotron """
+"""This code is modified from https://github.com/keithito/tacotron"""
 import re
 from text import cleaners
 from text.symbols import symbols
diff --git a/text/cleaners.py b/text/cleaners.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-""" This code is modified from https://github.com/keithito/tacotron """
+"""This code is modified from https://github.com/keithito/tacotron"""
 
 """
 Cleaners are transformations that run over the input text at both training and eval time.
diff --git a/text/cmudict.py b/text/cmudict.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""  This code is modified from https://github.com/keithito/tacotron """
+"""This code is modified from https://github.com/keithito/tacotron"""
 
 import re
 
diff --git a/text/numbers.py b/text/numbers.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-""" This code is modified from https://github.com/keithito/tacotron """
+"""This code is modified from https://github.com/keithito/tacotron"""
 
 import inflect
 import re
diff --git a/text/symbols.py b/text/symbols.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-""" This code is modified from https://github.com/keithito/tacotron """
+"""This code is modified from https://github.com/keithito/tacotron"""
 
 """
 Defines the set of symbols used in text input to the model.
diff --git a/utils/cut_by_vad.py b/utils/cut_by_vad.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-""" This code is modified from https://github.com/facebookresearch/libri-light/blob/main/data_preparation/cut_by_vad.py"""
+"""This code is modified from https://github.com/facebookresearch/libri-light/blob/main/data_preparation/cut_by_vad.py"""
 import pathlib
 import soundfile as sf
 import numpy as np
diff --git a/utils/mfa_prepare.py b/utils/mfa_prepare.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-""" This code is modified from https://montreal-forced-aligner.readthedocs.io/en/latest/user_guide/performance.html"""
+"""This code is modified from https://montreal-forced-aligner.readthedocs.io/en/latest/user_guide/performance.html"""
 
 import os
 import subprocess

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-""" CLAP Model`
	`1`	`+"""CLAP Model`
`2`	`2`
`3`	`3`	`Adapted from CLIP: https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.`
`4`	`4`	`Adapted to the Audio Task.`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-""" OpenAI pretrained model functions`
	`1`	`+"""OpenAI pretrained model functions`
`2`	`2`
`3`	`3`	`Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.`
`4`	`4`	`"""`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-""" timm model adapter`
	`1`	`+"""timm model adapter`
`2`	`2`
`3`	`3`	`Wraps timm (https://github.com/rwightman/pytorch-image-models) models for use as a vision tower in CLIP model.`
`4`	`4`	`"""`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-""" CLIP tokenizer`
	`1`	`+"""CLIP tokenizer`
`2`	`2`
`3`	`3`	`Copied from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI.`
`4`	`4`	`"""`
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-""" from https://github.com/keithito/tacotron """`
	`1`	`+"""from https://github.com/keithito/tacotron"""`
`2`	`2`
`3`	`3`	`import re`
`4`	`4`	`from unidecode import unidecode`