Skip to content

Configuration#

[general]
# Output (release) directory.
dist_dir = "/users/bdda/xhao/Datasets_bdda5/multichannel_multispeaker_2022_04_02/train"
# "false" means that all simulated data are saved into one directory.
max_num_files_per_dir = 1000

Dataloader#

Mixer #

__init__ #

Initialize the mixer.

Parameters:

Name Type Description Default
snr_list list[float]

List of signal-to-noise ratios (SNRs).

required
sir_list list[float]

List of signal-to-interference ratios (SIRs).

required
target_rms_level float

Target RMS level. Defaults to -25.

-25
target_rms_level_floating float

Floating value of the target RMS level. Defaults to -5.

-5
mixing_mode str

Mixing mode. Defaults to "min".

'min'

mix #

Mix the sources and noise.

Parameters:

Name Type Description Default
sources list[Source]

A list of sources.

required
noise Source

A noise source.

required

Returns:

Type Description
Source

The mixed source.

mixing_mode = mixing_mode instance-attribute #

sir_list = sir_list instance-attribute #

snr_list = snr_list instance-attribute #

target_rms_level = target_rms_level instance-attribute #

target_rms_level_floating = target_rms_level_floating instance-attribute #

Source #

__init__ #

Initialize a source object.

Parameters:

Name Type Description Default
y np.ndarray

1-D numpy array of audio samples.

required
sr int

sampling rate, default is 16000.

16000

add_rvb #

Convolve the source audio with the RIR.

Note

For the function simulateTrajectory, if the dim of num_sources or num_traj is 1, it will omit the first dim. Otherwise, it will do dynamic convolution.

Parameters:

Name Type Description Default
rir np.ndarray

RIR with the shape of [num_sources or num_traj, num_mic, num_channels]

required
further_split_rir bool

If True, it will further split the RIR into direct-path, early and late parts.

False

is_static property #

Check if the source is static or not.

Returns:

Type Description
bool

True if the source is static; False if the source is dynamic.

Raises

ValueError: if the trajectory of the source is not initialized.

loudness_gain: float = -1 instance-attribute #

n_y: np.ndarray | None = None instance-attribute #

path: Path | None = None instance-attribute #

rir: np.ndarray | None = None instance-attribute #

rir_direct_path: np.ndarray | None = None instance-attribute #

rir_early: np.ndarray | None = None instance-attribute #

rir_late: np.ndarray | None = None instance-attribute #

rir_peak_idx: np.ndarray | None = None instance-attribute #

source_id: str | None = None instance-attribute #

source_spk_id: str | None = None instance-attribute #

split_rir #

Split the RIR into direct-path, early and late parts.

sr = sr instance-attribute #

traj: np.ndarray | None = None instance-attribute #

traj_len property #

transcription: str = '' instance-attribute #

vad_label: np.ndarray | None = None instance-attribute #

y = y instance-attribute #

y_rvb: np.ndarray | None = None instance-attribute #

y_rvb_direct_path: np.ndarray | None = None instance-attribute #

y_rvb_early: np.ndarray | None = None instance-attribute #

y_rvb_late: np.ndarray | None = None instance-attribute #

y_vad: np.ndarray | None = None instance-attribute #

loudness_norm_rms #

Loudness normalize a signal based on the Root Mean Square (RMS).

Normalize the RMS of signals to a given RMS based on Decibels Relative to Full Scale (dBFS).

Parameters:

Name Type Description Default
y np.ndarray

[C, T] or [T,].

required
scalar float | None

Scalar used to normalize the RMS. Defaults to None.

None
target_rms

target RMS in dBFS.

required
ref_mic int

reference mic for multi-channel signals.

-1

Returns:

Type Description
tuple[np.ndarray, float]

Loudness normalized signal and scalar.

Note

A small number of signal samples may be clipped after normalization, but this does not matter.

scalar_to_desired_sxr #

Calculate the gain to apply to the interference so that a desired SXR (SNR or SIR) is achieved.

Parameters:

Name Type Description Default
meaningful np.ndarray

meaningful input, like target speech.

required
meaningless np.ndarray

meaningless or unwanted input, like background noise.

required
desired_ratio float

Desired SNR or SIR.

required

Returns:

Type Description
float

Gain, which can be used to adjust the RMS of the meaningless signals to satisfy the given ratio.

BaseDataloader #

__init__ #

__len__ #

vad = webrtcvad.Vad() instance-attribute #

Source #

__init__ #

Initialize a source object.

Parameters:

Name Type Description Default
y np.ndarray

1-D numpy array of audio samples.

required
sr int

sampling rate, default is 16000.

16000

add_rvb #

Convolve the source audio with the RIR.

Note

For the function simulateTrajectory, if the dim of num_sources or num_traj is 1, it will omit the first dim. Otherwise, it will do dynamic convolution.

Parameters:

Name Type Description Default
rir np.ndarray

RIR with the shape of [num_sources or num_traj, num_mic, num_channels]

required
further_split_rir bool

If True, it will further split the RIR into direct-path, early and late parts.

False

is_static property #

Check if the source is static or not.

Returns:

Type Description
bool

True if the source is static; False if the source is dynamic.

Raises

ValueError: if the trajectory of the source is not initialized.

loudness_gain: float = -1 instance-attribute #

n_y: np.ndarray | None = None instance-attribute #

path: Path | None = None instance-attribute #

rir: np.ndarray | None = None instance-attribute #

rir_direct_path: np.ndarray | None = None instance-attribute #

rir_early: np.ndarray | None = None instance-attribute #

rir_late: np.ndarray | None = None instance-attribute #

rir_peak_idx: np.ndarray | None = None instance-attribute #

source_id: str | None = None instance-attribute #

source_spk_id: str | None = None instance-attribute #

split_rir #

Split the RIR into direct-path, early and late parts.

sr = sr instance-attribute #

traj: np.ndarray | None = None instance-attribute #

traj_len property #

transcription: str = '' instance-attribute #

vad_label: np.ndarray | None = None instance-attribute #

y = y instance-attribute #

y_rvb: np.ndarray | None = None instance-attribute #

y_rvb_direct_path: np.ndarray | None = None instance-attribute #

y_rvb_early: np.ndarray | None = None instance-attribute #

y_rvb_late: np.ndarray | None = None instance-attribute #

y_vad: np.ndarray | None = None instance-attribute #

SourceDataloader #

__getitem__ #

__init__ #

Load data from a database and return Source class.

Parameters:

Name Type Description Default
database str

A file containing the file paths of the clean speech files.

required
num_sources int

Number of sources to be loaded.

required
offset int

The offset of the database.

0
limit int | None

The maximum number of files in the database. Default is None.

None
inclide_vad bool

Whether to include the VAD label.

False
sr int

The sampling rate of the audio.

16000
preload bool

Whether to preload the audio in parallel.

False
max_norm bool

Whether to normalize the audio.

True
source_id_fn Callable[[Path], str] | None

A function to get the source id.

None
source_spk_id_fn Callable[[Path], str] | None

A function to get the source speaker id.

None

__len__ #

fpath_list = fpath_list instance-attribute #

inclide_vad = inclide_vad instance-attribute #

max_norm = max_norm instance-attribute #

num_sources = num_sources instance-attribute #

preload = preload instance-attribute #

source_id_fn = source_id_fn instance-attribute #

source_spk_id_fn = source_spk_id_fn instance-attribute #

sr = sr instance-attribute #

waveform_list = [] instance-attribute #

expand_path #

load_wav #

# Clean-speech file list and how it is read.
# NOTE(review): the offset/limit/preload keys presumably map onto the
# loader's `offset`, `limit` and `preload` arguments, with `false` meaning
# "no limit" — confirm against the consumer.
clean_fpath_list = "/users/bdda/xhao/Datasets/wsj0-si/si_tr_s.txt"
clean_list_offset = 0
clean_list_limit = false
preload_clean_data = true

# Noise file list, with the same set of options as the clean list.
noise_fpath_list = "/users/bdda/xhao/Datasets/chime4-noise/chime4-noise.txt"
noise_list_offset = 0
noise_list_limit = false
preload_noise_data = true

# NOTE(review): the unit is not stated here; presumably seconds — confirm.
silence_duration = 0.02


[rir_simulator]

[rir_simulator.room]
sr = 16000
# NOTE(review): presumably candidate reverberation times (T60) in seconds — confirm.
t60_list = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
# Two 3-element bounds.
# NOTE(review): presumably [lower, upper] room dimensions — confirm axis order and unit.
room_size_range = [
    [6.0, 6.0, 3.05],
    [10.0, 8.0, 3.05],
]
# Two 6-element bounds.
# NOTE(review): presumably [lower, upper] with one coefficient per room surface — confirm.
absorption_coeff_range = [
    [0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
    [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
]

[rir_simulator.microphone]
array_setup = "dicit"

# The gpurir generator is enabled; the pyroomacoustics generator is disabled.
[rir_simulator.gpurir_generator]
enable = true

[rir_simulator.pyroomacoustics_generator]
enable = false

# NOTE(review): "trajecotry" looks like a misspelling of "trajectory". The table
# name is part of what the consumer reads, so it is kept exactly as-is.
[rir_simulator.trajecotry_generator]
# NOTE(review): units for the ranges and interval below are not stated — confirm.
moving_speed_range = [1.1, 1.5]
speaker_height_range = [1.5, 1.9]
moving_interval = 0.125
min_allowable_distance_to_wall = 0.5

# Trajectory types; all four are enabled.
[rir_simulator.trajecotry_generator.curve]
enable = true
num_control_points = 1

[rir_simulator.trajecotry_generator.line]
enable = true

[rir_simulator.trajecotry_generator.static]
enable = true

[rir_simulator.trajecotry_generator.curved_quadrilateral]
enable = true
num_control_points = 1

[mixer]
# Candidate signal-to-noise ratios.
# NOTE(review): presumably in dB — confirm.
snr_list = [-5, 0, 5, 10, 15, 20]
# Candidate signal-to-interference ratios, stepping by 3 from -6 to 6.
sir_list = [-6, -3, 0, 3, 6]
sr = 16000
target_loudness_level = -25
# NOTE(review): the Mixer API docs list -5 as the default of
# `target_rms_level_floating`; confirm which sign the consumer expects here.
loudness_floating_value = 5