import os from functools import lru_cache from typing import BinaryIO, Dict, Optional, Tuple, Type, Union import torch from torchaudio._extension import lazy_import_sox_ext from torchaudio.io import CodecConfig from torio._extension import lazy_import_ffmpeg_ext from . import soundfile_backend from .backend import Backend from .common import AudioMetaData from .ffmpeg import FFmpegBackend from .soundfile import SoundfileBackend from .sox import SoXBackend @lru_cache(None) def get_available_backends() -> Dict[str, Type[Backend]]: backend_specs: Dict[str, Type[Backend]] = {} if lazy_import_ffmpeg_ext().is_available(): backend_specs["ffmpeg"] = FFmpegBackend if lazy_import_sox_ext().is_available(): backend_specs["sox"] = SoXBackend if soundfile_backend._IS_SOUNDFILE_AVAILABLE: backend_specs["soundfile"] = SoundfileBackend return backend_specs def get_backend(backend_name, backends) -> Backend: if backend := backends.get(backend_name): return backend else: raise ValueError( f"Unsupported backend '{backend_name}' specified; ", f"please select one of {list(backends.keys())} instead.", ) def get_info_func(): backends = get_available_backends() def dispatcher( uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str] ) -> Backend: if backend_name is not None: return get_backend(backend_name, backends) for backend in backends.values(): if backend.can_decode(uri, format): return backend raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.") def info( uri: Union[BinaryIO, str, os.PathLike], format: Optional[str] = None, buffer_size: int = 4096, backend: Optional[str] = None, ) -> AudioMetaData: """Get signal information of an audio file. Note: When the input type is file-like object, this function cannot get the correct length (``num_samples``) for certain formats, such as ``vorbis``. In this case, the value of ``num_samples`` is ``0``. Args: uri (path-like object or file-like object): Source of audio data. The following types are accepted: * ``path-like``: File path or URL. * ``file-like``: Object with ``read(size: int) -> bytes`` method, which returns byte string of at most ``size`` length. format (str or None, optional): If not ``None``, interpreted as hint that may allow backend to override the detected format. (Default: ``None``) buffer_size (int, optional): Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``) backend (str or None, optional): I/O backend to use. If ``None``, function selects backend given input and available backends. Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``], with the corresponding backend available. (Default: ``None``) .. seealso:: :ref:`backend` Returns: AudioMetaData """ backend = dispatcher(uri, format, backend) return backend.info(uri, format, buffer_size) return info def get_load_func(): backends = get_available_backends() def dispatcher( uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str] ) -> Backend: if backend_name is not None: return get_backend(backend_name, backends) for backend in backends.values(): if backend.can_decode(uri, format): return backend raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.") def load( uri: Union[BinaryIO, str, os.PathLike], frame_offset: int = 0, num_frames: int = -1, normalize: bool = True, channels_first: bool = True, format: Optional[str] = None, buffer_size: int = 4096, backend: Optional[str] = None, ) -> Tuple[torch.Tensor, int]: """Load audio data from source. By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with ``float32`` dtype, and the shape of `[channel, time]`. Note: The formats this function can handle depend on the availability of backends. Please use the following functions to fetch the supported formats. - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_decoders` - Sox: :py:func:`torchaudio.utils.sox_utils.list_read_formats` - SoundFile: Refer to `the official document `__. .. warning:: ``normalize`` argument does not perform volume normalization. It only converts the sample type to `torch.float32` from the native sample type. When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``, this function can return integer Tensor, where the samples are expressed within the whole range of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM, ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors. ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as ``flac`` and ``mp3``. For these formats, this function always returns ``float32`` Tensor with values. Args: uri (path-like object or file-like object): Source of audio data. frame_offset (int, optional): Number of frames to skip before start reading data. num_frames (int, optional): Maximum number of frames to read. ``-1`` reads all the remaining samples, starting from ``frame_offset``. This function may return the less number of frames if there is not enough frames in the given file. normalize (bool, optional): When ``True``, this function converts the native sample type to ``float32``. Default: ``True``. If input file is integer WAV, giving ``False`` will change the resulting Tensor type to integer type. This argument has no effect for formats other than integer WAV type. channels_first (bool, optional): When True, the returned Tensor has dimension `[channel, time]`. Otherwise, the returned Tensor's dimension is `[time, channel]`. format (str or None, optional): If not ``None``, interpreted as hint that may allow backend to override the detected format. (Default: ``None``) buffer_size (int, optional): Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``) backend (str or None, optional): I/O backend to use. If ``None``, function selects backend given input and available backends. Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``], with the corresponding backend being available. (Default: ``None``) .. seealso:: :ref:`backend` Returns: (torch.Tensor, int): Resulting Tensor and sample rate. If the input file has integer wav format and normalization is off, then it has integer type, else ``float32`` type. If ``channels_first=True``, it has `[channel, time]` else `[time, channel]`. """ backend = dispatcher(uri, format, backend) return backend.load(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size) return load def get_save_func(): backends = get_available_backends() def dispatcher( uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str] ) -> Backend: if backend_name is not None: return get_backend(backend_name, backends) for backend in backends.values(): if backend.can_encode(uri, format): return backend raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.") def save( uri: Union[BinaryIO, str, os.PathLike], src: torch.Tensor, sample_rate: int, channels_first: bool = True, format: Optional[str] = None, encoding: Optional[str] = None, bits_per_sample: Optional[int] = None, buffer_size: int = 4096, backend: Optional[str] = None, compression: Optional[Union[CodecConfig, float, int]] = None, ): """Save audio data to file. Note: The formats this function can handle depend on the availability of backends. Please use the following functions to fetch the supported formats. - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_encoders` - Sox: :py:func:`torchaudio.utils.sox_utils.list_write_formats` - SoundFile: Refer to `the official document `__. Args: uri (str or pathlib.Path): Path to audio file. src (torch.Tensor): Audio data to save. must be 2D tensor. sample_rate (int): sampling rate channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`, otherwise `[time, channel]`. format (str or None, optional): Override the audio format. When ``uri`` argument is path-like object, audio format is inferred from file extension. If the file extension is missing or different, you can specify the correct format with this argument. When ``uri`` argument is file-like object, this argument is required. Valid values are ``"wav"``, ``"ogg"``, and ``"flac"``. encoding (str or None, optional): Changes the encoding for supported formats. This argument is effective only for supported formats, i.e. ``"wav"`` and ``""flac"```. Valid values are - ``"PCM_S"`` (signed integer Linear PCM) - ``"PCM_U"`` (unsigned integer Linear PCM) - ``"PCM_F"`` (floating point PCM) - ``"ULAW"`` (mu-law) - ``"ALAW"`` (a-law) bits_per_sample (int or None, optional): Changes the bit depth for the supported formats. When ``format`` is one of ``"wav"`` and ``"flac"``, you can change the bit depth. Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``. buffer_size (int, optional): Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``) backend (str or None, optional): I/O backend to use. If ``None``, function selects backend given input and available backends. Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``], with the corresponding backend being available. (Default: ``None``) .. seealso:: :ref:`backend` compression (CodecConfig, float, int, or None, optional): Compression configuration to apply. If the selected backend is FFmpeg, an instance of :py:class:`CodecConfig` must be provided. Otherwise, if the selected backend is SoX, a float or int value corresponding to option ``-C`` of the ``sox`` command line interface must be provided. For instance: ``"mp3"`` Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``. ``"flac"`` Whole number from ``0`` to ``8``. ``8`` is default and highest compression. ``"ogg"``, ``"vorbis"`` Number from ``-1`` to ``10``; ``-1`` is the highest compression and lowest quality. Default: ``3``. Refer to http://sox.sourceforge.net/soxformat.html for more details. """ backend = dispatcher(uri, format, backend) return backend.save( uri, src, sample_rate, channels_first, format, encoding, bits_per_sample, buffer_size, compression ) return save