add xi_vector (#404)

mrjunjieli · web-flow · commit 3bba9bd2b8ed · 2025-02-23T23:08:39.000+08:00
* add xi_vector

* fix flake8 error

* fix flake8 error

* fix flake8 error

* update Readme

* fix lint errors
diff --git a/README.md b/README.md
@@ -60,7 +60,7 @@ pre-commit install  # for clean and tidy code
 ```
 
 ## 🔥 News
-
+* 2025.02.23: Add support for the Xi-vector, see [#404](https://github.com/wenet-e2e/wespeaker/pull/404).
 * 2024.09.03: Support the SimAM_ResNet and the model pretrained on VoxBlink2, check [Pretrained Models](docs/pretrained.md) for the pretrained model, [VoxCeleb Recipe](https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb/v2) for the super performance, and [python usage](docs/python_package.md) for the command line usage!
 * 2024.08.30: We support whisper_encoder based frontend and propose the [Whisper-PMFA](https://arxiv.org/pdf/2408.15585) framework, check [#356](https://github.com/wenet-e2e/wespeaker/pull/356).
 * 2024.08.20: Update diarization recipe for VoxConverse dataset by leveraging umap dimensionality reduction and hdbscan clustering, see [#347](https://github.com/wenet-e2e/wespeaker/pull/347) and [#352](https://github.com/wenet-e2e/wespeaker/pull/352).
diff --git a/examples/voxceleb/v2/README.md b/examples/voxceleb/v2/README.md
@@ -62,6 +62,8 @@
 | SimAM_ResNet100 (VoxBlink2 Pretrain)       | 50.2M |       | √ | x | × | 0.229 | 0.458 | 0.868 |
 |                      |       |       | √ | √ | × | 0.207 | 0.424 | 0.804 |
 |                      |       |       | √ | √ | √ | 0.202 | 0.421 | 0.795 |
+| XI_VEC_ECAPA_TDNN_c512       | 5.9M | 0.68G      | x | x | × | 0.995 | 1.130 | 2.169 |
+|                  |       |       | × | √ | × | 0.883 | 1.056 | 1.976 |
 
 
 ## PLDA results
diff --git a/examples/voxceleb/v2/conf/xi_vector.yaml b/examples/voxceleb/v2/conf/xi_vector.yaml
@@ -0,0 +1,83 @@
+### train configuration
+
+exp_dir: exp/XI_VEC_ECAPA_TDNN_c512-emb192-fbank80-num_frms200-aug0.6-spTrue-saFalse-ArcMargin-SGD-epoch150
+gpus: "[0]"
+num_avg: 10
+enable_amp: False # whether enable automatic mixed precision training
+
+seed: 42
+num_epochs: 150
+save_epoch_interval: 5 # save model every 5 epochs
+log_batch_interval: 100 # log every 100 batches
+
+dataloader_args:
+  batch_size: 512
+  num_workers: 16
+  pin_memory: False
+  prefetch_factor: 8
+  drop_last: True
+
+dataset_args:
+  # the number of samples traversed within one epoch; if the value equals 0,
+  # the number of utterances in the dataset is used as sample_num_per_epoch.
+  sample_num_per_epoch: 0
+  shuffle: True
+  shuffle_args:
+    shuffle_size: 2500
+  filter: True
+  filter_args:
+    min_num_frames: 100
+    max_num_frames: 800
+  resample_rate: 16000
+  speed_perturb: True
+  num_frms: 200
+  aug_prob: 0.6 # prob to add reverb & noise aug per sample
+  frontend: "fbank" # fbank, s3prl
+  fbank_args:
+    num_mel_bins: 80
+    frame_shift: 10
+    frame_length: 25
+    dither: 1.0
+  spec_aug: False
+  spec_aug_args:
+    num_t_mask: 1
+    num_f_mask: 1
+    max_t: 10
+    max_f: 8
+    prob: 0.6
+
+model: XI_VEC_ECAPA_TDNN_c512 # XI_VEC_ECAPA_TDNN_c512, XI_VEC_ECAPA_TDNN_c1024, XI_VEC_XVEC
+model_init: null
+model_args:
+  feat_dim: 80
+  embed_dim: 192
+  pooling_func: "XI" # the default pooling_func in ECAPA_TDNN is ASTP
+projection_args:
+  project_type: "arc_margin" # add_margin, arc_margin, sphere, softmax
+  scale: 32.0
+  easy_margin: False
+
+margin_scheduler: MarginScheduler
+margin_update:
+  initial_margin: 0.0
+  final_margin: 0.2
+  increase_start_epoch: 20
+  fix_start_epoch: 40
+  update_margin: True
+  increase_type: "exp" # exp, linear
+
+loss: CrossEntropyLoss
+loss_args: {}
+
+optimizer: SGD
+optimizer_args:
+  momentum: 0.9
+  nesterov: True
+  weight_decay: 0.0001
+
+scheduler: ExponentialDecrease
+scheduler_args:
+  initial_lr: 0.1
+  final_lr: 0.00005
+  warm_up_epoch: 6
+  warm_from_zero: True
diff --git a/wespeaker/models/ecapa_tdnn.py b/wespeaker/models/ecapa_tdnn.py
@@ -281,7 +281,7 @@ def ECAPA_TDNN_GLOB_c512(feat_dim,
                                  pooling_func='ASTP')
     model.eval()
     out = model(x)
-    print(out.shape)
+    print(out[-1].shape)
 
     num_params = sum(param.numel() for param in model.parameters())
     print("{} M".format(num_params / 1e6))
diff --git a/wespeaker/models/pooling_layers.py b/wespeaker/models/pooling_layers.py
@@ -310,6 +310,77 @@ def get_out_dim(self):
         return self.out_dim
 
 
+class XI(torch.nn.Module):
+    """Xi-vector pooling: Gaussian posterior inference over frame features.
+
+    A small 1x1-convolution network predicts a log-precision for every
+    channel of every frame. These values, together with a prior
+    log-precision, are softmax-normalised along the frame axis and used as
+    weights to average the frames and a prior mean into one vector per
+    sample.
+
+    Args:
+        in_dim: number of channels of the incoming frame-level features.
+        hidden_size: width of the hidden layer of the precision estimator.
+        stddev: if True, a weighted standard deviation is concatenated to
+            the mean, and ``get_out_dim`` reports ``2 * in_dim``.
+        train_mean: ``requires_grad`` flag of the prior mean parameter.
+        train_prec: ``requires_grad`` flag of the prior log-precision
+            parameter.
+        **kwargs: accepted and ignored.
+    """
+
+    def __init__(self, in_dim, hidden_size=256, stddev=False,
+                 train_mean=True, train_prec=True, **kwargs):
+        super(XI, self).__init__()
+        self.input_dim = in_dim
+        self.stddev = stddev
+        if self.stddev:
+            self.output_dim = 2 * self.input_dim
+        else:
+            self.output_dim = self.input_dim
+        # Prior mean / log-precision, one value per channel, zero-initialised.
+        self.prior_mean = torch.nn.Parameter(torch.zeros(1, self.input_dim),
+                                             requires_grad=train_mean)
+        self.prior_logprec = torch.nn.Parameter(torch.zeros(1, self.input_dim),
+                                                requires_grad=train_prec)
+        self.softmax = torch.nn.Softmax(dim=2)
+
+        # Log-precision estimator
+        self.lin1_relu_bn = nn.Sequential(
+            nn.Conv1d(self.input_dim, hidden_size,
+                      kernel_size=1, stride=1, bias=True),
+            nn.ReLU(inplace=True),
+            nn.BatchNorm1d(hidden_size))
+        self.lin2 = nn.Conv1d(hidden_size, self.input_dim, kernel_size=1,
+                              stride=1, bias=True)
+        self.softplus2 = torch.nn.Softplus(beta=1, threshold=20)
+
+    def forward(self, inputs):
+        """Pool frame-level features into one vector per sample.
+
+        Args:
+            inputs: a 3-dimensional tensor (a batch), indexed as
+                [samples-index, frames-dim-index, frames-index]; the second
+                dimension must equal ``in_dim``.
+
+        Returns:
+            The posterior mean of shape (batch, in_dim) when ``stddev`` is
+            False; otherwise mean and standard deviation concatenated, of
+            shape (batch, 2 * in_dim, 1).
+        """
+        assert len(inputs.shape) == 3
+        assert inputs.shape[1] == self.input_dim
+        feat = inputs
+        batch_size = feat.shape[0]
+
+        # Log-precision estimator
+        # frame precision estimate
+        prec = self.softplus2(self.lin2(self.lin1_relu_bn(feat)))
+        # Square and take log before softmax
+        logprec = 2.0 * torch.log(prec)
+
+        # The prior is appended as one extra entry on the frame axis so that
+        # it takes part in the softmax normalisation. Each tensor is built
+        # once here and reused below.
+        prior_logprec = self.prior_logprec.repeat(
+            batch_size, 1).unsqueeze(dim=2)
+        prior_mean = self.prior_mean.repeat(
+            batch_size, 1).unsqueeze(dim=2)
+        feat_with_prior = torch.cat((feat, prior_mean), 2)
+
+        # Gaussian Posterior Inference
+        # Option 1: a_o (prior_mean-phi) included in variance
+        weight_attn = self.softmax(torch.cat((logprec, prior_logprec), 2))
+        # The posterior precision (sum of the exponentiated log-precisions)
+        # is not part of the returned value, so it is not computed.
+
+        # Posterior mean
+        phi = torch.sum(feat_with_prior * weight_attn, dim=2)
+        if not self.stddev:
+            return phi
+
+        # Weighted second moment -> standard deviation; the clamp keeps the
+        # argument of sqrt strictly positive.
+        sigma2 = torch.sum(feat_with_prior.pow(2) * weight_attn, dim=2)
+        sigma = torch.sqrt(torch.clamp(sigma2 - phi ** 2, min=1.0e-12))
+        # NOTE(review): this branch returns a 3-D tensor (trailing axis of
+        # size 1) while the branch above returns a 2-D tensor; confirm that
+        # callers using stddev=True expect the extra axis.
+        return torch.cat((phi, sigma), dim=1).unsqueeze(dim=2)
+
+    def get_out_dim(self):
+        """Return the channel size of the pooled output."""
+        return self.output_dim
+
+    def get_prior(self):
+        """Return the (prior_mean, prior_logprec) parameters."""
+        return self.prior_mean, self.prior_logprec
+
 if __name__ == '__main__':
     data = torch.randn(16, 512, 10, 35)
     # model = StatisticsPooling()
diff --git a/wespeaker/models/speaker_model.py b/wespeaker/models/speaker_model.py
@@ -23,6 +23,7 @@
 import wespeaker.models.whisper_PMFA as whisper_PMFA
 import wespeaker.models.redimnet as redimnet
 import wespeaker.models.samresnet as samresnet
+import wespeaker.models.xi_vector as xi_vector
 
 
 
@@ -49,6 +50,8 @@ def get_speaker_model(model_name: str):
         return getattr(redimnet, model_name)
     elif model_name.startswith("SimAM_ResNet"):
         return getattr(samresnet, model_name)
+    elif model_name.startswith("XI_VEC"):
+        return getattr(xi_vector, model_name)
     else:  # model_name error !!!
         print(model_name + " not found !!!")
         exit(1)
diff --git a/wespeaker/models/xi_vector.py b/wespeaker/models/xi_vector.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2025 Shuai Wang (wsstriving@gmail.com)
+#               2025 Junjie LI (junjie98.li@connect.polyu.hk)
+#               2025 Tianchi Liu (tianchi_liu@u.nus.edu)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+'''The implementation of Xi_vector.
+
+Reference:
+[1] Lee, K. A., Wang, Q., & Koshinaka, T. (2021). Xi-vector embedding
+for speaker recognition. IEEE Signal Processing Letters, 28, 1385-1389.
+'''
+
+
+import torch
+import wespeaker.models.ecapa_tdnn as ecapa_tdnn
+import wespeaker.models.tdnn as tdnn
+
+
+
+
+def XI_VEC_ECAPA_TDNN_c1024(feat_dim, embed_dim, pooling_func='XI', emb_bn=False):
+    """Build an ECAPA_TDNN with 1024 channels; pooling defaults to 'XI'.
+
+    All arguments are forwarded unchanged to ``ecapa_tdnn.ECAPA_TDNN``.
+    """
+    shared_args = dict(feat_dim=feat_dim,
+                       embed_dim=embed_dim,
+                       pooling_func=pooling_func,
+                       emb_bn=emb_bn)
+    return ecapa_tdnn.ECAPA_TDNN(channels=1024, **shared_args)
+
+
+def XI_VEC_ECAPA_TDNN_c512(feat_dim, embed_dim, pooling_func='XI', emb_bn=False):
+    """Build an ECAPA_TDNN with 512 channels; pooling defaults to 'XI'.
+
+    All arguments are forwarded unchanged to ``ecapa_tdnn.ECAPA_TDNN``.
+    """
+    shared_args = dict(feat_dim=feat_dim,
+                       embed_dim=embed_dim,
+                       pooling_func=pooling_func,
+                       emb_bn=emb_bn)
+    return ecapa_tdnn.ECAPA_TDNN(channels=512, **shared_args)
+
+
+
+def XI_VEC_XVEC(feat_dim, embed_dim, pooling_func='XI'):
+    """Build a ``tdnn.XVEC`` model; pooling defaults to 'XI'."""
+    xvec_args = {
+        'feat_dim': feat_dim,
+        'embed_dim': embed_dim,
+        'pooling_func': pooling_func,
+    }
+    return tdnn.XVEC(**xvec_args)
+
+
+if __name__ == '__main__':
+    # Smoke test: run a random (1, 200, 80) input through XI_VEC_XVEC and
+    # print the size of the last output, the parameter count, and the
+    # numbers reported by thop's profile().
+    dummy_input = torch.rand(1, 200, 80)
+    model = XI_VEC_XVEC(feat_dim=80, embed_dim=512, pooling_func='XI')
+    model.eval()
+    outputs = model(dummy_input)
+    last_output = outputs[-1]
+    print(last_output.size())
+
+    param_count = 0
+    for param in model.parameters():
+        param_count += param.numel()
+    print("{} M".format(param_count / 1e6))
+
+    # thop is imported here, so it is only needed when running this script.
+    from thop import profile
+    profile_input = torch.randn(1, 200, 80)
+    flops, params = profile(model, inputs=(profile_input, ))
+    print("FLOPs: {} G, Params: {} M".format(flops / 1e9, params / 1e6))