# Source path: Traktor/myenv/Lib/site-packages/torchaudio/models/wav2letter.py

from torch import nn, Tensor

# Names exported by ``from <this module> import *``: only the model class.
__all__ = [
    "Wav2Letter",
]


class Wav2Letter(nn.Module):
    r"""Wav2Letter model architecture from *Wav2Letter: an End-to-End ConvNet-based Speech
    Recognition System* :cite:`collobert2016wav2letter`.

    The network is a stack of 1-D convolutions, each followed by a ReLU. When
    ``input_type`` is ``"waveform"`` an extra strided convolution front-end is
    placed before the acoustic stack to turn raw samples into 250 channels.

    See Also:
        * `Training example <https://github.com/pytorch/audio/tree/release/0.12/examples/pipeline_wav2letter>`__

    Args:
        num_classes (int, optional): Number of classes to be classified. (Default: ``40``)
        input_type (str, optional): Wav2Letter can use as input: ``waveform``, ``power_spectrum``
         or ``mfcc`` (Default: ``waveform``).
        num_features (int, optional): Number of input features that the network will receive (Default: ``1``).

    Raises:
        ValueError: If ``input_type`` is not one of ``waveform``, ``power_spectrum`` or ``mfcc``.
    """

    def __init__(self, num_classes: int = 40, input_type: str = "waveform", num_features: int = 1) -> None:
        super().__init__()

        # Fail fast: without this check an unknown input_type would leave
        # ``self.acoustic_model`` unset and only blow up later inside forward().
        valid_input_types = ("waveform", "power_spectrum", "mfcc")
        if input_type not in valid_input_types:
            raise ValueError(
                f"Unsupported input_type {input_type!r}; expected one of {valid_input_types}."
            )

        # With raw waveform input the front-end below already produces 250
        # channels, so the acoustic stack consumes those instead of num_features.
        acoustic_num_features = 250 if input_type == "waveform" else num_features

        layers = [
            nn.Conv1d(in_channels=acoustic_num_features, out_channels=250, kernel_size=48, stride=2, padding=23),
            nn.ReLU(inplace=True),
        ]
        # Seven identical length-preserving blocks (kernel 7, padding 3).
        for _ in range(7):
            layers.append(nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3))
            layers.append(nn.ReLU(inplace=True))
        layers.extend(
            [
                nn.Conv1d(in_channels=250, out_channels=2000, kernel_size=32, stride=1, padding=16),
                nn.ReLU(inplace=True),
                nn.Conv1d(in_channels=2000, out_channels=2000, kernel_size=1, stride=1, padding=0),
                nn.ReLU(inplace=True),
                nn.Conv1d(in_channels=2000, out_channels=num_classes, kernel_size=1, stride=1, padding=0),
                nn.ReLU(inplace=True),
            ]
        )
        # Unpacking into Sequential keeps the sub-module indices 0..21, so
        # state_dict keys stay the same as with the explicit layer listing.
        acoustic_model = nn.Sequential(*layers)

        if input_type == "waveform":
            waveform_model = nn.Sequential(
                nn.Conv1d(in_channels=num_features, out_channels=250, kernel_size=250, stride=160, padding=45),
                nn.ReLU(inplace=True),
            )
            self.acoustic_model = nn.Sequential(waveform_model, acoustic_model)
        else:
            # "power_spectrum" and "mfcc" feed the acoustic stack directly.
            self.acoustic_model = acoustic_model

    def forward(self, x: Tensor) -> Tensor:
        r"""
        Args:
            x (torch.Tensor): Tensor of dimension (batch_size, num_features, input_length).

        Returns:
            Tensor: Log-probabilities of dimension (batch_size, number_of_classes, output_length),
            normalized over the class dimension. ``output_length`` differs from
            ``input_length`` because of the strided convolutions in the network.
        """

        x = self.acoustic_model(x)
        # Normalize over the class (channel) axis.
        x = nn.functional.log_softmax(x, dim=1)
        return x
losowanie zdjec 2024-05-26 05:12:46 +02:00			`from torch import nn, Tensor`

			`__all__ = [`
			`"Wav2Letter",`
			`]`


			`class Wav2Letter(nn.Module):`
			`r"""Wav2Letter model architecture from *Wav2Letter: an End-to-End ConvNet-based Speech`
			Recognition System* :cite:`collobert2016wav2letter`.

			`See Also:`
			* `Training example <https://github.com/pytorch/audio/tree/release/0.12/examples/pipeline_wav2letter>`__

			`Args:`
			num_classes (int, optional): Number of classes to be classified. (Default: ``40``)
			input_type (str, optional): Wav2Letter can use as input: ``waveform``, ``power_spectrum``
			or ``mfcc`` (Default: ``waveform``).
			num_features (int, optional): Number of input features that the network will receive (Default: ``1``).
			`"""`

			`def __init__(self, num_classes: int = 40, input_type: str = "waveform", num_features: int = 1) -> None:`
			`super().__init__()`

			`acoustic_num_features = 250 if input_type == "waveform" else num_features`
			`acoustic_model = nn.Sequential(`
			`nn.Conv1d(in_channels=acoustic_num_features, out_channels=250, kernel_size=48, stride=2, padding=23),`
			`nn.ReLU(inplace=True),`
			`nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),`
			`nn.ReLU(inplace=True),`
			`nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),`
			`nn.ReLU(inplace=True),`
			`nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),`
			`nn.ReLU(inplace=True),`
			`nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),`
			`nn.ReLU(inplace=True),`
			`nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),`
			`nn.ReLU(inplace=True),`
			`nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),`
			`nn.ReLU(inplace=True),`
			`nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),`
			`nn.ReLU(inplace=True),`
			`nn.Conv1d(in_channels=250, out_channels=2000, kernel_size=32, stride=1, padding=16),`
			`nn.ReLU(inplace=True),`
			`nn.Conv1d(in_channels=2000, out_channels=2000, kernel_size=1, stride=1, padding=0),`
			`nn.ReLU(inplace=True),`
			`nn.Conv1d(in_channels=2000, out_channels=num_classes, kernel_size=1, stride=1, padding=0),`
			`nn.ReLU(inplace=True),`
			`)`

			`if input_type == "waveform":`
			`waveform_model = nn.Sequential(`
			`nn.Conv1d(in_channels=num_features, out_channels=250, kernel_size=250, stride=160, padding=45),`
			`nn.ReLU(inplace=True),`
			`)`
			`self.acoustic_model = nn.Sequential(waveform_model, acoustic_model)`

			`if input_type in ["power_spectrum", "mfcc"]:`
			`self.acoustic_model = acoustic_model`

			`def forward(self, x: Tensor) -> Tensor:`
			`r"""`
			`Args:`
			`x (torch.Tensor): Tensor of dimension (batch_size, num_features, input_length).`

			`Returns:`
			`Tensor: Predictor tensor of dimension (batch_size, number_of_classes, input_length).`
			`"""`

			`x = self.acoustic_model(x)`
			`x = nn.functional.log_softmax(x, dim=1)`
			`return x`