Bundle adjustment of a multi-camera rig¶
In this example, we will create a ground truth multi-camera rig, generate synthetic observations, perturb the camera poses, and then perform bundle adjustment to optimize the camera poses and 3D point positions.
import numpy as np
import plotly.graph_objects as go
from IPython.display import HTML, display
from scipy.linalg import expm
from deeperfly import Camera, CameraGroup, bundle_adjust
from deeperfly import geometry as geom
# deeperfly's own rendering is headless OpenCV (no plotting deps); the plots below
# are drawn with Plotly directly so the 3D rig views and the 2D overlay stay
# interactive on the rendered docs site.
# Theme-neutral palette: a mid-gray for all text and axis lines reads on both the
# light and dark docs themes, and the figure backgrounds are left transparent so
# the page theme shows through.
FG = "#888888"
GRID = "rgba(136, 136, 136, 0.25)"
def show(fig):
"Style a Plotly figure for both light/dark backgrounds, then render it as self-contained HTML."
fig.update_layout(
paper_bgcolor="rgba(0, 0, 0, 0)",
plot_bgcolor="rgba(0, 0, 0, 0)",
font_color=FG,
title_x=0.5,
title_xanchor="center",
title_font_color=FG,
)
fig.update_xaxes(color=FG, gridcolor=GRID, zerolinecolor=GRID)
fig.update_yaxes(color=FG, gridcolor=GRID, zerolinecolor=GRID)
# 3D scenes: drop the opaque background panes (jarring on a dark page) but keep
# faint gridlines for depth; no-op for figures without a scene.
fig.update_scenes(
xaxis=dict(showbackground=False, gridcolor=GRID, color=FG),
yaxis=dict(showbackground=False, gridcolor=GRID, color=FG),
zaxis=dict(showbackground=False, gridcolor=GRID, color=FG),
)
display(
HTML(
fig.to_html(
include_plotlyjs="cdn", full_html=False, config={"responsive": True}
)
)
)
1. Build the rig¶
Each camera is an orbit spec around a look_at target: the camera sits
distance away in the direction given by azimuth_deg (and elevation_deg,
zero here) and looks back at the target with world +z up.
Camera.from_spec resolves the spec into canonical (rvec, tvec) pairs.
We leave principal_point_px unset and hand each camera its nominal
(height, width), so the principal point is inferred as the image center —
exactly as deeperfly run does from the footage.
AZIMUTHS_DEG = {"rh": -120, "rm": -90, "rf": -45, "f": 0, "lf": 45, "lm": 90, "lh": 120}
SPEC = {
"focal_length_px": 22388.125, # scalar means fx == fy; [fx, fy] also works
"distortion_coefficients": [], # empty means no distortion
"look_at": [0.0, 0.0, 0.0],
"distance": 107.463,
}
group = CameraGroup(
{
name: Camera.from_spec(
{**SPEC, "azimuth_deg": az}, name=name, image_size=(512, 1024)
)
for name, az in AZIMUTHS_DEG.items()
}
)
print("cameras:", group.names)
print("intrinsics [fx, fy, cx, cy]:", group.intrs[0])
cameras: ['rh', 'rm', 'rf', 'f', 'lf', 'lm', 'lh'] intrinsics [fx, fy, cx, cy]: [22388.125 22388.125 511.5 255.5 ]
2. Visualize the rig¶
Each camera is drawn as an RGB triad (x=red, y=green, z=blue/optical axis) at its
world center -R.T @ t.
def plot_camera(rmat, tvec, fig, length=None, name=None, **kwargs):
"Add a camera to a 3D figure as an RGB axis triad (x=red, y=green, z=blue) at its world center, optionally labelled with its name."
center = -rmat.T @ tvec
if length is None:
length = np.linalg.norm(tvec) * 0.2
for axis, color in zip(rmat, ("red", "green", "blue")):
tip = center + axis * length
fig.add_scatter3d(
x=[center[0], tip[0]],
y=[center[1], tip[1]],
z=[center[2], tip[2]],
mode="lines",
line=dict(color=color, width=5),
showlegend=False,
hoverinfo="skip",
**kwargs,
)
if name is not None:
# Label just outside the camera, along the ray from the look-at origin
# through its center, so the text clears the triad. Drawn at full opacity
# (the label trace takes no **kwargs) so it stays legible on a faded rig.
label = center * 1.18
fig.add_scatter3d(
x=[label[0]],
y=[label[1]],
z=[label[2]],
mode="text",
text=[name],
textfont=dict(color=FG, size=12),
showlegend=False,
hoverinfo="skip",
)
def plot_rig(rvecs, tvecs, fig, names=None, **kwargs):
"Add every camera in a rig to a 3D figure as an axis triad, labelling each with its name when `names` is given."
rmats = np.asarray(geom.rvec_to_rmat(rvecs))
names = names if names is not None else [None] * len(tvecs)
for rmat, tvec, name in zip(rmats, tvecs, names):
plot_camera(rmat, tvec, fig, name=name, **kwargs)
def rig_scene(eye=None, up=None):
"A 3D scene framed to hold the whole rig (cameras + triads + labels) on load, so every camera is visible at once."
camera = dict(eye=eye or dict(x=1.25, y=-1.9, z=1.5))
if up is not None:
camera["up"] = up
return dict(
aspectmode="data",
# Ranges padded past the camera centers (radius ~107), their triads and
# the name labels, so nothing is clipped and the rig fills the view.
xaxis=dict(range=[-145, 145]),
yaxis=dict(range=[-145, 145]),
zaxis=dict(range=[-60, 60]),
camera=camera,
)
fig = go.Figure()
plot_rig(group.rvecs, group.tvecs, fig, names=group.names)
fig.add_scatter3d(
x=[0],
y=[0],
z=[0],
mode="markers",
marker=dict(color=FG, symbol="x", size=5),
showlegend=False,
hoverinfo="skip",
) # look-at target
fig.update_layout(
title_text="ground-truth rig",
height=560,
margin=dict(l=0, r=0, t=40, b=0),
scene=rig_scene(),
)
show(fig)
3. Synthesize observations¶
Sample a random 3D cloud near the origin and project it through every camera to get the 2D observations we will fit against.
rng = np.random.default_rng(0)
pts3d_gt = rng.uniform(-0.5, 0.5, size=(100, 3))
pts2d = np.asarray(group.project(pts3d_gt)) # (V, N, 2)
print("observations:", pts2d.shape)
observations: (7, 100, 2)
4. Perturb the cameras¶
Bundle adjustment starts from a noisy guess. We rotate each camera by a small random rotation and jitter its translation.
def small_rotation(sigma, seed):
"Random rotation matrix near identity (axis-angle std `sigma`)."
o = np.random.default_rng(seed).normal(scale=sigma, size=3)
return expm(np.array([[0, -o[2], o[1]], [o[2], 0, -o[0]], [-o[1], o[0], 0]]))
rmats0 = np.array(
[
small_rotation(0.05, i) @ R
for i, R in enumerate(np.asarray(geom.rvec_to_rmat(group.rvecs)))
]
)
rvecs0 = np.asarray(geom.rmat_to_rvec(rmats0))
tvecs0 = group.tvecs + np.random.default_rng(0).normal(
scale=5.0, size=group.tvecs.shape
)
cams0 = CameraGroup.from_arrays(group.names, rvecs0, tvecs0, group.intrs, group.dists)
fig = go.Figure()
plot_rig(group.rvecs, group.tvecs, fig, names=group.names, opacity=0.3)
plot_rig(cams0.rvecs, cams0.tvecs, fig)
fig.update_layout(
title_text="ground truth (faded) vs. perturbed initial guess",
height=560,
margin=dict(l=0, r=0, t=40, b=0),
scene=rig_scene(),
)
show(fig)
5. Bundle-adjust¶
bundle_adjust triangulates an initial point cloud from the (perturbed) cameras,
then jointly refines camera poses and 3D points to minimize reprojection error.
We hold all intrinsics fixed via fixed=["*.intr"]; the gauge (overall world
pose & scale) is left free, so the cost — not the raw parameters — should go to
zero.
res, cams_opt, pts3d_opt = bundle_adjust(cams0, pts2d, fixed=["*.intr"], max_nfev=2000)
reproj = np.asarray(cams_opt.project(pts3d_opt)) - pts2d
rmse = np.sqrt(np.nanmean(reproj**2))
print(f"converged: cost={res.cost:.3e} nfev={res.nfev} status={res.status}")
print(f"reprojection RMSE: {rmse:.3e} px")
converged: cost=6.941e-16 nfev=31 status=3 reprojection RMSE: 9.957e-10 px
6. Align to ground truth and compare¶
Reprojection error is invariant to a global similarity transform of the
scene — rotating, translating and rescaling all cameras and points together
leaves every image point unchanged. So minimizing reprojection alone fixes the
geometry only up to this 7-DOF gauge freedom; the near-zero RMSE above is the
meaningful convergence check, not the raw parameter values. (To pin the gauge
outright, anchor a camera and fix a distance via the fixed / shared keys under
[bundle_adjustment] in a run config.)
To compare the recovered geometry with ground truth we first remove the gauge with a closed-form similarity alignment (Umeyama, 1991) of the camera centers, then apply the same transform to the cameras and points.
def similarity_align(src, dst):
"""Least-squares similarity (scale s, rotation R, translation t) with
``dst ~= s * (R @ src) + t`` (Umeyama 1991)."""
mu_s, mu_d = src.mean(0), dst.mean(0)
src_c, dst_c = src - mu_s, dst - mu_d
cov = dst_c.T @ src_c / len(src)
U, sigma, Vt = np.linalg.svd(cov)
signs = np.ones(3)
signs[-1] = np.sign(np.linalg.det(U @ Vt)) # keep a proper rotation
R = U @ np.diag(signs) @ Vt
s = (sigma * signs).sum() / (src_c**2).sum() * len(src)
return s, R, mu_d - s * R @ mu_s
def camera_centers(rvecs, tvecs):
rmats = np.asarray(geom.rvec_to_rmat(rvecs))
return -np.einsum("oij,oi->oj", rmats, tvecs)
centers_gt = camera_centers(group.rvecs, group.tvecs)
s, R, t = similarity_align(camera_centers(cams_opt.rvecs, cams_opt.tvecs), centers_gt)
# Apply X' = s R X + t to the world: cameras transform as R_i' = R_i R^T,
# t_i' = s t_i - R_i' t (projection is scale-invariant), points as X' = s R X + t.
rmats_aligned = np.asarray(geom.rvec_to_rmat(cams_opt.rvecs)) @ R.T
tvecs_aligned = s * cams_opt.tvecs - np.einsum("oij,j->oi", rmats_aligned, t)
rvecs_aligned = np.asarray(geom.rmat_to_rvec(rmats_aligned))
pts3d_aligned = pts3d_opt @ (s * R).T + t
print(f"recovered scale: {s:.4f}")
print(
"max camera-center error:",
np.max(
np.linalg.norm(
camera_centers(rvecs_aligned, tvecs_aligned) - centers_gt, axis=-1
)
),
)
print("max point error:", np.max(np.linalg.norm(pts3d_aligned - pts3d_gt, axis=-1)))
fig = go.Figure()
plot_rig(group.rvecs, group.tvecs, fig, names=group.names, opacity=0.3)
plot_rig(rvecs_aligned, tvecs_aligned, fig)
fig.update_layout(
title_text="ground truth (faded) vs. recovered (similarity-aligned)",
height=560,
margin=dict(l=0, r=0, t=40, b=0),
# top-down (bird's-eye) view: look straight down +z with +y up.
scene=rig_scene(eye=dict(x=0, y=0, z=2.4), up=dict(x=0, y=1, z=0)),
)
show(fig)
recovered scale: 0.9760 max camera-center error: 4.239051743802059e-09 max point error: 7.103692050014605e-10
7. Robustness to outliers¶
Real observations are never this clean: a 2D detector occasionally locks onto
the wrong feature and reports a confidently wrong point. To test how bundle
adjustment copes, we rerun the experiment with corrupted observations: mild
Gaussian noise (σ = 0.5 px) on every observation to set a realistic error
floor, plus gross outliers — a random 10% of observations displaced by
30–150 px in a random direction. The cameras still start from the perturbed
guess cams0.
rng_corrupt = np.random.default_rng(7)
n_views, n_pts = pts2d.shape[:2]
# mild detection noise on every observation
pts2d_corrupt = pts2d + rng_corrupt.normal(scale=0.5, size=pts2d.shape)
# gross outliers on a random 10%: 30-150 px in a random direction
n_out = round(0.1 * n_views * n_pts)
out_view, out_pt = np.unravel_index(
rng_corrupt.choice(n_views * n_pts, size=n_out, replace=False), (n_views, n_pts)
)
angle = rng_corrupt.uniform(0, 2 * np.pi, size=n_out)
magnitude = rng_corrupt.uniform(30, 150, size=n_out)
pts2d_corrupt[out_view, out_pt] += magnitude[:, None] * np.stack(
[np.cos(angle), np.sin(angle)], axis=-1
)
v = group.names.index("f")
sel = out_pt[out_view == v]
fig = go.Figure()
fig.add_scatter(
x=pts2d[v, :, 0],
y=pts2d[v, :, 1],
mode="markers",
marker=dict(size=5, color="gray"),
name="clean",
)
# displacement segments (clean -> outlier) as one trace, None-separated
xs, ys = [], []
for p in sel:
xs += [pts2d[v, p, 0], pts2d_corrupt[v, p, 0], None]
ys += [pts2d[v, p, 1], pts2d_corrupt[v, p, 1], None]
fig.add_scatter(
x=xs,
y=ys,
mode="lines",
line=dict(color="rgb(214,39,40)", width=1),
opacity=0.6,
showlegend=False,
hoverinfo="skip",
)
fig.add_scatter(
x=pts2d_corrupt[v, sel, 0],
y=pts2d_corrupt[v, sel, 1],
mode="markers",
marker=dict(size=9, color="rgb(214,39,40)", symbol="x"),
name="outlier",
)
# equal aspect; image coordinates have y growing downward
fig.update_yaxes(scaleanchor="x", scaleratio=1, autorange="reversed")
fig.update_layout(
title_text=f"camera 'f': {len(sel)} of {n_pts} observations corrupted",
height=420,
margin=dict(l=0, r=0, t=40, b=0),
)
show(fig)
With the default loss="linear", cost is the sum of squared residuals, so a
single 100 px outlier contributes as much pull as ten thousand 1 px inliers —
the fit trades real accuracy everywhere to shave the few gross errors.
loss="huber" (forwarded, like f_scale, to scipy.optimize.least_squares) is
quadratic for residuals below f_scale and linear above, bounding each
outlier's influence. A proper f_scale sits between the inlier noise level
and the outlier magnitude: here noise is σ = 0.5 px and outliers start at
30 px, so f_scale = 1 works. We also fit with f_scale = 1000, which puts
every residual in the quadratic regime and silently degenerates to the linear
fit.
(An implementation note: scipy's native robust-loss handling rescales each
Jacobian row by a second-order "Triggs" factor that is exactly zero for Huber
residuals beyond f_scale, and the degenerate rows stall its sparse LSMR
trust-region solver far from the optimum. bundle_adjust therefore swaps the
string "huber" for an IRLS-form callable — identical cost and gradient,
ρ″ = 0 — which keeps the subproblem well-posed; see
deeperfly.bundle_adjustment.core._IRLS_LOSSES.)
def evaluate(cams_fit, pts3d_fit):
"""Errors vs. the true projections / geometry (gauge aligned on the points)."""
rmse = np.sqrt(np.nanmean((np.asarray(cams_fit.project(pts3d_fit)) - pts2d) ** 2))
s, R, t = similarity_align(pts3d_fit, pts3d_gt)
pt_err = np.linalg.norm(pts3d_fit @ (s * R).T + t - pts3d_gt, axis=-1)
centers = camera_centers(cams_fit.rvecs, cams_fit.tvecs) @ (s * R).T + t
return (
rmse,
np.median(pt_err),
np.max(np.linalg.norm(centers - centers_gt, axis=-1)),
)
header = f"{'fit':22s}{'RMSE vs true [px]':>19s}{'median point err':>18s}{'max center err':>16s}"
print(header + "\n" + "-" * len(header))
for label, kwargs in {
"linear": {},
"huber, f_scale=1": dict(loss="huber", f_scale=1.0),
"huber, f_scale=1000": dict(loss="huber", f_scale=1000.0),
}.items():
res, cams_fit, pts3d_fit = bundle_adjust(
cams0, pts2d_corrupt, fixed=["*.intr"], max_nfev=2000, **kwargs
)
rmse, pt_err, ctr_err = evaluate(cams_fit, pts3d_fit)
note = "" if res.status > 0 else " <- hit max_nfev"
print(f"{label:22s}{rmse:19.3f}{pt_err:18.4f}{ctr_err:16.3f}{note}")
fit RMSE vs true [px] median point err max center err ---------------------------------------------------------------------------
linear 10.679 0.0540 13.604
huber, f_scale=1 0.371 0.0027 0.753
huber, f_scale=1000 10.668 0.0529 13.437
The linear fit converges, but to the wrong rig: the 10% of gross outliers leave
a ~10 px error floor against the true projections and pull the camera centers
visibly off. The Huber fit with f_scale=1 recovers the geometry down to the
noise floor — a ~30× improvement — because the bounded outlier influence lets
the 90% clean observations dominate. The f_scale=1000 row shows that the
threshold matters: it exceeds every residual, so the "robust" fit is exactly
the linear one.
In a run config the robust fit maps to flat keys under
[bundle_adjustment]:
[bundle_adjustment]
loss = "huber"
f_scale = 1.0
One caveat: Huber bounds each outlier's pull but does not zero it, so a point
whose observations are mostly corrupted is still dragged (the worst point here
has 4 of its 7 views corrupted). For heavier contamination, downweight or drop
suspect observations before fitting — bundle_adjust accepts per-observation
weights, which the pipeline fills with detector confidences.