Getting Started
This page demonstrates an end-to-end workflow with speech signals from CMU ARCTIC:
- Download utterances for two speakers via torchrir.datasets.
- Build 10-second source signals by concatenating random utterances and clipping.
- Simulate static and dynamic RIRs under constrained source-array geometry.
- Save WAV outputs, layout plots, waveform/spectrogram plots, and a dynamic GIF.
All code blocks on this page are sourced from examples/getting_started.py.
Install
pip install torchrir
0) Common Setup (Dataset + Geometry Constraints)
# Fix the torch seed and create the seeded Python RNG that is handed to the
# dataset loader below.
torch.manual_seed(42)
rng = random.Random(42)

# Every artifact written by this page goes under this directory.
out_dir = Path("docs/assets/getting-started")
out_dir.mkdir(parents=True, exist_ok=True)

# Download CMU ARCTIC data as needed and load two unique speakers.
dataset_root = Path("datasets/cmu_arctic")


def _make_dataset(spk):
    """Return the CMU ARCTIC dataset for ``spk``, using "bdl" when it is falsy."""
    return CmuArcticDataset(dataset_root, speaker=spk or "bdl", download=True)


signals, fs, source_info = load_dataset_sources(
    dataset_factory=_make_dataset,
    num_sources=2,
    # Concatenate random utterances until >=10 s, then clip to 10 s.
    duration_s=10.0,
    rng=rng,
)

selected_speakers = [name for name, _ in source_info]
print("Selected speakers:", selected_speakers)
# Expected shape: (2, fs * 10)
print("Loaded original signal shape:", tuple(signals.shape))
room = Room.shoebox(size=[8.0, 6.0, 3.0], fs=fs, beta=[0.9] * 6)

# Use a slightly jittered room-center position for the mic array to avoid
# exact-center artifacts in symmetric setups.
room_center = (room.size / 2.0).to(torch.float32)
# NOTE: the three draws consume the shared RNG in x, y, z order; keep that
# order so the jitter stays the same for a given seed.
mic_jitter = torch.tensor(
    [
        rng.uniform(-0.03, 0.03),
        rng.uniform(-0.03, 0.03),
        rng.uniform(-0.01, 0.01),
    ],
    dtype=torch.float32,
)
mic_center = room_center + mic_jitter
mic_pos = arrays.binaural_array(mic_center, offset=0.10)  # 20 cm spacing
mics = MicrophoneArray.from_positions(mic_pos)

# Place two sources at radius >= 2 m from array center with >= 30 deg separation.
source_radius = 2.2
source_angles_deg = [30.0, 150.0]
center_x = mic_center[0].item()
center_y = mic_center[1].item()
# Each source sits on a circle around the array center in the horizontal
# plane, at a fixed height of 1.5.
src_pos = torch.tensor(
    [
        [
            center_x + source_radius * math.cos(math.radians(deg)),
            center_y + source_radius * math.sin(math.radians(deg)),
            1.5,
        ]
        for deg in source_angles_deg
    ],
    dtype=torch.float32,
)

# Sanity-check the constraints stated above against the actual positions.
relative_xy = src_pos[:, :2] - mic_center[:2]
radii = torch.linalg.norm(relative_xy, dim=1)
# Angular separation has to account for wrap-around (10 deg and 350 deg are
# 20 deg apart, not 340), so fold the raw difference into [0, 180].
raw_gap = abs(source_angles_deg[1] - source_angles_deg[0]) % 360.0
angle_gap = min(raw_gap, 360.0 - raw_gap)
assert bool(torch.all(radii >= 2.0)), (
    "sources must be at least 2 m from the array center"
)
assert angle_gap >= 30.0, "sources must be separated by at least 30 deg"

sources_static = Source.from_positions(src_pos)
1) Static RIR + Convolution + Plots
# "auto" may resolve to mps or cuda; use "cpu" for a warning-free run.
device = "auto"

rirs_static = simulate_rir(
    room=room,
    sources=sources_static,
    mics=mics,
    max_order=6,
    tmax=0.4,
    directivity="omni",
    device=device,
)
# Expected shape: (n_src, n_mic, rir_len)
print("Static RIR shape:", tuple(rirs_static.shape))

# Match the source signals to the RIR's device and dtype before convolving.
dry_static = signals.to(rirs_static.device, dtype=rirs_static.dtype)
convolved_static = convolve_rir(dry_static, rirs_static)
# Expected shape: (n_mic, n_samples + rir_len - 1)
print("Static convolved shape:", tuple(convolved_static.shape))

# Save original and convolved audio.
static_wavs = (
    ("static_original_src01.wav", signals[0]),
    ("static_original_src02.wav", signals[1]),
    ("static_convolved.wav", convolved_static),
)
for wav_name, audio in static_wavs:
    save_wav(out_dir / wav_name, audio, fs)
# Save static layout image (no animation in static mode).
# Only the x/y columns are passed, giving a top-view plot.
layout_ax = plot_scene_static(
    room=room.size[:2],
    sources=sources_static.positions[:, :2],
    mics=mics.positions[:, :2],
    title="Static layout (top view)",
)
layout_fig = layout_ax.figure
layout_fig.savefig(out_dir / "layout_static.png", dpi=150, bbox_inches="tight")
plt.close(layout_fig)

# Save waveform+spectrogram pair plots (seconds on the x-axis).
# One row per figure: (signal, output file name, plot title).
static_pair_plots = (
    (signals[0], "static_original_src01_pair.png", "Static original source 1"),
    (signals[1], "static_original_src02_pair.png", "Static original source 2"),
    (convolved_static[0], "static_convolved_mic1_pair.png", "Static convolved mic 1"),
    (convolved_static[1], "static_convolved_mic2_pair.png", "Static convolved mic 2"),
)
for pair_signal, png_name, pair_title in static_pair_plots:
    save_waveform_spectrogram_pair(
        signal=pair_signal,
        fs=fs,
        out_path=out_dir / png_name,
        title=pair_title,
    )
Static preview (generated by running the code above):

Source 1 (original):
Source 2 (original):
Mic mixture (convolved):
2) Dynamic RIR + Trajectory Convolution + Animation
# Number of trajectory steps (T in the shape comments below).
steps = 128

# Source 1 stays fixed; source 2 moves toward source 1.
fixed_traj = src_pos[0].unsqueeze(0).repeat(steps, 1)  # (T, 3)

move_start = src_pos[1]
# Stop slightly offset from source 1 rather than exactly on top of it.
move_end = src_pos[0] + torch.tensor([0.35, 0.10, 0.0], dtype=torch.float32)
displacement = move_end - move_start
# Interpolation weights running from 0 to 1, one per step.
blend = torch.linspace(0.0, 1.0, steps, dtype=torch.float32).unsqueeze(1)
moving_traj = move_start.unsqueeze(0) + blend * displacement.unsqueeze(0)

src_traj = torch.stack([fixed_traj, moving_traj], dim=1)  # (T, 2, 3)
# The microphones do not move: repeat their positions at every step.
mic_traj = mics.positions.unsqueeze(0).repeat(steps, 1, 1)  # (T, n_mic, 3)
sources_dynamic = Source.from_positions(src_traj[0])

# Confirm that the two sources end up closer together than they started.
gap_start = torch.linalg.norm(src_traj[0, 1] - src_traj[0, 0]).item()
gap_end = torch.linalg.norm(src_traj[-1, 1] - src_traj[-1, 0]).item()
assert gap_end < gap_start

rirs_dynamic = simulate_dynamic_rir(
    room=room,
    src_traj=src_traj,
    mic_traj=mic_traj,
    max_order=6,
    tmax=0.4,
    directivity="omni",
    device=device,
)
# Expected shape: (T, n_src, n_mic, rir_len)
print("Dynamic RIR shape:", tuple(rirs_dynamic.shape))

# Match the source signals to the RIR's device and dtype before convolving.
dry_dynamic = signals.to(rirs_dynamic.device, dtype=rirs_dynamic.dtype)
trajectory_convolver = DynamicConvolver(mode="trajectory")
convolved_dynamic = trajectory_convolver.convolve(dry_dynamic, rirs_dynamic)
# Expected shape: (n_mic, n_samples + rir_len - 1)
print("Dynamic convolved shape:", tuple(convolved_dynamic.shape))

# Save original and convolved audio.
dynamic_wavs = (
    ("dynamic_original_src01.wav", signals[0]),
    ("dynamic_original_src02.wav", signals[1]),
    ("dynamic_convolved.wav", convolved_dynamic),
)
for wav_name, audio in dynamic_wavs:
    save_wav(out_dir / wav_name, audio, fs)
# Save dynamic layout animation.
gif_path = out_dir / "layout_dynamic.gif"
num_samples = signals.shape[1]
animate_scene_gif(
    out_path=gif_path,
    room=room.size,
    sources=sources_dynamic,
    mics=mics,
    src_traj=src_traj,
    mic_traj=mic_traj,
    signal_len=num_samples,
    fs=fs,
)

# Save waveform+spectrogram pair plots (seconds on the x-axis).
# One row per figure: (signal, output file name, plot title).
dynamic_pair_plots = (
    (signals[0], "dynamic_original_src01_pair.png", "Dynamic original source 1"),
    (signals[1], "dynamic_original_src02_pair.png", "Dynamic original source 2"),
    (convolved_dynamic[0], "dynamic_convolved_mic1_pair.png", "Dynamic convolved mic 1"),
    (convolved_dynamic[1], "dynamic_convolved_mic2_pair.png", "Dynamic convolved mic 2"),
)
for pair_signal, png_name, pair_title in dynamic_pair_plots:
    save_waveform_spectrogram_pair(
        signal=pair_signal,
        fs=fs,
        out_path=out_dir / png_name,
        title=pair_title,
    )
Dynamic preview (generated by running the code above):

Source 1 (original):
Source 2 (original):
Mic mixture (convolved):
Note
The first dataset download can take time and requires network access.
GIF generation relies on Matplotlib's Pillow-based animation writer, so Pillow must be installed.
device="auto" may select mps or cuda; if you want a warning-free tutorial run, use device="cpu".
Next Steps
- See Examples for CLI workflows and dataset generation scripts.
- See API documentation for all options and full signatures.