Bundle adjustment of a multi-camera rig¶

In this example, we will create a ground truth multi-camera rig, generate synthetic observations, perturb the camera poses, and then perform bundle adjustment to optimize the camera poses and 3D point positions.

In [2]:

Copied!





import numpy as np
import plotly.graph_objects as go
from IPython.display import HTML, display
from scipy.linalg import expm

from deeperfly import Camera, CameraGroup, bundle_adjust
from deeperfly import geometry as geom

# deeperfly's own rendering is headless OpenCV (no plotting deps); the plots below
# are drawn with Plotly directly so the 3D rig views and the 2D overlay stay
# interactive on the rendered docs site.

# Theme-neutral palette: a mid-gray for all text and axis lines reads on both the
# light and dark docs themes, and the figure backgrounds are left transparent so
# the page theme shows through.
FG = "#888888"
GRID = "rgba(136, 136, 136, 0.25)"


def show(fig):
    "Style a Plotly figure for both light/dark backgrounds, then render it as self-contained HTML."
    fig.update_layout(
        paper_bgcolor="rgba(0, 0, 0, 0)",
        plot_bgcolor="rgba(0, 0, 0, 0)",
        font_color=FG,
        title_x=0.5,
        title_xanchor="center",
        title_font_color=FG,
    )
    fig.update_xaxes(color=FG, gridcolor=GRID, zerolinecolor=GRID)
    fig.update_yaxes(color=FG, gridcolor=GRID, zerolinecolor=GRID)
    # 3D scenes: drop the opaque background panes (jarring on a dark page) but keep
    # faint gridlines for depth; no-op for figures without a scene.
    fig.update_scenes(
        xaxis=dict(showbackground=False, gridcolor=GRID, color=FG),
        yaxis=dict(showbackground=False, gridcolor=GRID, color=FG),
        zaxis=dict(showbackground=False, gridcolor=GRID, color=FG),
    )
    display(
        HTML(
            fig.to_html(
                include_plotlyjs="cdn", full_html=False, config={"responsive": True}
            )
        )
    )
import numpy as np
import plotly.graph_objects as go
from IPython.display import HTML, display
from scipy.linalg import expm

from deeperfly import Camera, CameraGroup, bundle_adjust
from deeperfly import geometry as geom

# deeperfly's own rendering is headless OpenCV (no plotting deps); the plots below
# are drawn with Plotly directly so the 3D rig views and the 2D overlay stay
# interactive on the rendered docs site.

# Theme-neutral palette: a mid-gray for all text and axis lines reads on both the
# light and dark docs themes, and the figure backgrounds are left transparent so
# the page theme shows through.
FG = "#888888"
GRID = "rgba(136, 136, 136, 0.25)"


def show(fig):
    "Style a Plotly figure for both light/dark backgrounds, then render it as self-contained HTML."
    fig.update_layout(
        paper_bgcolor="rgba(0, 0, 0, 0)",
        plot_bgcolor="rgba(0, 0, 0, 0)",
        font_color=FG,
        title_x=0.5,
        title_xanchor="center",
        title_font_color=FG,
    )
    fig.update_xaxes(color=FG, gridcolor=GRID, zerolinecolor=GRID)
    fig.update_yaxes(color=FG, gridcolor=GRID, zerolinecolor=GRID)
    # 3D scenes: drop the opaque background panes (jarring on a dark page) but keep
    # faint gridlines for depth; no-op for figures without a scene.
    fig.update_scenes(
        xaxis=dict(showbackground=False, gridcolor=GRID, color=FG),
        yaxis=dict(showbackground=False, gridcolor=GRID, color=FG),
        zaxis=dict(showbackground=False, gridcolor=GRID, color=FG),
    )
    display(
        HTML(
            fig.to_html(
                include_plotlyjs="cdn", full_html=False, config={"responsive": True}
            )
        )
    )

1. Build the rig¶

Each camera is an orbit spec around a look_at target: the camera sits distance away in the direction given by azimuth_deg (and elevation_deg, zero here) and looks back at the target with world +z up. Camera.from_spec resolves the spec into canonical (rvec, tvec) pairs. We leave principal_point_px unset and hand each camera its nominal (height, width), so the principal point is inferred as the image center — exactly as deeperfly run does from the footage.

In [3]:

Copied!





AZIMUTHS_DEG = {"rh": -120, "rm": -90, "rf": -45, "f": 0, "lf": 45, "lm": 90, "lh": 120}
SPEC = {
    "focal_length_px": 22388.125,  # scalar means fx == fy; [fx, fy] also works
    "distortion_coefficients": [],  # empty means no distortion
    "look_at": [0.0, 0.0, 0.0],
    "distance": 107.463,
}

group = CameraGroup(
    {
        name: Camera.from_spec(
            {**SPEC, "azimuth_deg": az}, name=name, image_size=(512, 1024)
        )
        for name, az in AZIMUTHS_DEG.items()
    }
)
print("cameras:", group.names)
print("intrinsics [fx, fy, cx, cy]:", group.intrs[0])
AZIMUTHS_DEG = {"rh": -120, "rm": -90, "rf": -45, "f": 0, "lf": 45, "lm": 90, "lh": 120}
SPEC = {
    "focal_length_px": 22388.125,  # scalar means fx == fy; [fx, fy] also works
    "distortion_coefficients": [],  # empty means no distortion
    "look_at": [0.0, 0.0, 0.0],
    "distance": 107.463,
}

group = CameraGroup(
    {
        name: Camera.from_spec(
            {**SPEC, "azimuth_deg": az}, name=name, image_size=(512, 1024)
        )
        for name, az in AZIMUTHS_DEG.items()
    }
)
print("cameras:", group.names)
print("intrinsics [fx, fy, cx, cy]:", group.intrs[0])

cameras: ['rh', 'rm', 'rf', 'f', 'lf', 'lm', 'lh']
intrinsics [fx, fy, cx, cy]: [22388.125 22388.125   511.5     255.5  ]

2. Visualize the rig¶

Each camera is drawn as an RGB triad (x=red, y=green, z=blue/optical axis) at its world center -R.T @ t.

In [4]:

Copied!





def plot_camera(rmat, tvec, fig, length=None, name=None, **kwargs):
    "Add a camera to a 3D figure as an RGB axis triad (x=red, y=green, z=blue) at its world center, optionally labelled with its name."
    center = -rmat.T @ tvec
    if length is None:
        length = np.linalg.norm(tvec) * 0.2
    for axis, color in zip(rmat, ("red", "green", "blue")):
        tip = center + axis * length
        fig.add_scatter3d(
            x=[center[0], tip[0]],
            y=[center[1], tip[1]],
            z=[center[2], tip[2]],
            mode="lines",
            line=dict(color=color, width=5),
            showlegend=False,
            hoverinfo="skip",
            **kwargs,
        )
    if name is not None:
        # Label just outside the camera, along the ray from the look-at origin
        # through its center, so the text clears the triad. Drawn at full opacity
        # (the label trace takes no **kwargs) so it stays legible on a faded rig.
        label = center * 1.18
        fig.add_scatter3d(
            x=[label[0]],
            y=[label[1]],
            z=[label[2]],
            mode="text",
            text=[name],
            textfont=dict(color=FG, size=12),
            showlegend=False,
            hoverinfo="skip",
        )


def plot_rig(rvecs, tvecs, fig, names=None, **kwargs):
    "Add every camera in a rig to a 3D figure as an axis triad, labelling each with its name when `names` is given."
    rmats = np.asarray(geom.rvec_to_rmat(rvecs))
    names = names if names is not None else [None] * len(tvecs)
    for rmat, tvec, name in zip(rmats, tvecs, names):
        plot_camera(rmat, tvec, fig, name=name, **kwargs)


def rig_scene(eye=None, up=None):
    "A 3D scene framed to hold the whole rig (cameras + triads + labels) on load, so every camera is visible at once."
    camera = dict(eye=eye or dict(x=1.25, y=-1.9, z=1.5))
    if up is not None:
        camera["up"] = up
    return dict(
        aspectmode="data",
        # Ranges padded past the camera centers (radius ~107), their triads and
        # the name labels, so nothing is clipped and the rig fills the view.
        xaxis=dict(range=[-145, 145]),
        yaxis=dict(range=[-145, 145]),
        zaxis=dict(range=[-60, 60]),
        camera=camera,
    )


fig = go.Figure()
plot_rig(group.rvecs, group.tvecs, fig, names=group.names)
fig.add_scatter3d(
    x=[0],
    y=[0],
    z=[0],
    mode="markers",
    marker=dict(color=FG, symbol="x", size=5),
    showlegend=False,
    hoverinfo="skip",
)  # look-at target
fig.update_layout(
    title_text="ground-truth rig",
    height=560,
    margin=dict(l=0, r=0, t=40, b=0),
    scene=rig_scene(),
)
show(fig)
def plot_camera(rmat, tvec, fig, length=None, name=None, **kwargs):
    "Add a camera to a 3D figure as an RGB axis triad (x=red, y=green, z=blue) at its world center, optionally labelled with its name."
    center = -rmat.T @ tvec
    if length is None:
        length = np.linalg.norm(tvec) * 0.2
    for axis, color in zip(rmat, ("red", "green", "blue")):
        tip = center + axis * length
        fig.add_scatter3d(
            x=[center[0], tip[0]],
            y=[center[1], tip[1]],
            z=[center[2], tip[2]],
            mode="lines",
            line=dict(color=color, width=5),
            showlegend=False,
            hoverinfo="skip",
            **kwargs,
        )
    if name is not None:
        # Label just outside the camera, along the ray from the look-at origin
        # through its center, so the text clears the triad. Drawn at full opacity
        # (the label trace takes no **kwargs) so it stays legible on a faded rig.
        label = center * 1.18
        fig.add_scatter3d(
            x=[label[0]],
            y=[label[1]],
            z=[label[2]],
            mode="text",
            text=[name],
            textfont=dict(color=FG, size=12),
            showlegend=False,
            hoverinfo="skip",
        )


def plot_rig(rvecs, tvecs, fig, names=None, **kwargs):
    "Add every camera in a rig to a 3D figure as an axis triad, labelling each with its name when `names` is given."
    rmats = np.asarray(geom.rvec_to_rmat(rvecs))
    names = names if names is not None else [None] * len(tvecs)
    for rmat, tvec, name in zip(rmats, tvecs, names):
        plot_camera(rmat, tvec, fig, name=name, **kwargs)


def rig_scene(eye=None, up=None):
    "A 3D scene framed to hold the whole rig (cameras + triads + labels) on load, so every camera is visible at once."
    camera = dict(eye=eye or dict(x=1.25, y=-1.9, z=1.5))
    if up is not None:
        camera["up"] = up
    return dict(
        aspectmode="data",
        # Ranges padded past the camera centers (radius ~107), their triads and
        # the name labels, so nothing is clipped and the rig fills the view.
        xaxis=dict(range=[-145, 145]),
        yaxis=dict(range=[-145, 145]),
        zaxis=dict(range=[-60, 60]),
        camera=camera,
    )


fig = go.Figure()
plot_rig(group.rvecs, group.tvecs, fig, names=group.names)
fig.add_scatter3d(
    x=[0],
    y=[0],
    z=[0],
    mode="markers",
    marker=dict(color=FG, symbol="x", size=5),
    showlegend=False,
    hoverinfo="skip",
)  # look-at target
fig.update_layout(
    title_text="ground-truth rig",
    height=560,
    margin=dict(l=0, r=0, t=40, b=0),
    scene=rig_scene(),
)
show(fig)

3. Synthesize observations¶

Sample a random 3D cloud near the origin and project it through every camera to get the 2D observations we will fit against.

In [5]:

Copied!





rng = np.random.default_rng(0)
pts3d_gt = rng.uniform(-0.5, 0.5, size=(100, 3))
pts2d = np.asarray(group.project(pts3d_gt))  # (V, N, 2)
print("observations:", pts2d.shape)
rng = np.random.default_rng(0)
pts3d_gt = rng.uniform(-0.5, 0.5, size=(100, 3))
pts2d = np.asarray(group.project(pts3d_gt))  # (V, N, 2)
print("observations:", pts2d.shape)

observations: (7, 100, 2)

4. Perturb the cameras¶

Bundle adjustment starts from a noisy guess. We rotate each camera by a small random rotation and jitter its translation.

In [6]:

Copied!





def small_rotation(sigma, seed):
    "Random rotation matrix near identity (axis-angle std `sigma`)."
    o = np.random.default_rng(seed).normal(scale=sigma, size=3)
    return expm(np.array([[0, -o[2], o[1]], [o[2], 0, -o[0]], [-o[1], o[0], 0]]))


rmats0 = np.array(
    [
        small_rotation(0.05, i) @ R
        for i, R in enumerate(np.asarray(geom.rvec_to_rmat(group.rvecs)))
    ]
)
rvecs0 = np.asarray(geom.rmat_to_rvec(rmats0))
tvecs0 = group.tvecs + np.random.default_rng(0).normal(
    scale=5.0, size=group.tvecs.shape
)
cams0 = CameraGroup.from_arrays(group.names, rvecs0, tvecs0, group.intrs, group.dists)

fig = go.Figure()
plot_rig(group.rvecs, group.tvecs, fig, names=group.names, opacity=0.3)
plot_rig(cams0.rvecs, cams0.tvecs, fig)
fig.update_layout(
    title_text="ground truth (faded) vs. perturbed initial guess",
    height=560,
    margin=dict(l=0, r=0, t=40, b=0),
    scene=rig_scene(),
)
show(fig)
def small_rotation(sigma, seed):
    "Random rotation matrix near identity (axis-angle std `sigma`)."
    o = np.random.default_rng(seed).normal(scale=sigma, size=3)
    return expm(np.array([[0, -o[2], o[1]], [o[2], 0, -o[0]], [-o[1], o[0], 0]]))


rmats0 = np.array(
    [
        small_rotation(0.05, i) @ R
        for i, R in enumerate(np.asarray(geom.rvec_to_rmat(group.rvecs)))
    ]
)
rvecs0 = np.asarray(geom.rmat_to_rvec(rmats0))
tvecs0 = group.tvecs + np.random.default_rng(0).normal(
    scale=5.0, size=group.tvecs.shape
)
cams0 = CameraGroup.from_arrays(group.names, rvecs0, tvecs0, group.intrs, group.dists)

fig = go.Figure()
plot_rig(group.rvecs, group.tvecs, fig, names=group.names, opacity=0.3)
plot_rig(cams0.rvecs, cams0.tvecs, fig)
fig.update_layout(
    title_text="ground truth (faded) vs. perturbed initial guess",
    height=560,
    margin=dict(l=0, r=0, t=40, b=0),
    scene=rig_scene(),
)
show(fig)

5. Bundle-adjust¶

bundle_adjust triangulates an initial point cloud from the (perturbed) cameras, then jointly refines camera poses and 3D points to minimize reprojection error. We hold all intrinsics fixed via fixed=["*.intr"]; the gauge (overall world pose & scale) is left free, so the cost — not the raw parameters — should go to zero.

In [7]:

Copied!





res, cams_opt, pts3d_opt = bundle_adjust(cams0, pts2d, fixed=["*.intr"], max_nfev=2000)

reproj = np.asarray(cams_opt.project(pts3d_opt)) - pts2d
rmse = np.sqrt(np.nanmean(reproj**2))
print(f"converged: cost={res.cost:.3e}  nfev={res.nfev}  status={res.status}")
print(f"reprojection RMSE: {rmse:.3e} px")
res, cams_opt, pts3d_opt = bundle_adjust(cams0, pts2d, fixed=["*.intr"], max_nfev=2000)

reproj = np.asarray(cams_opt.project(pts3d_opt)) - pts2d
rmse = np.sqrt(np.nanmean(reproj**2))
print(f"converged: cost={res.cost:.3e}  nfev={res.nfev}  status={res.status}")
print(f"reprojection RMSE: {rmse:.3e} px")

converged: cost=6.941e-16  nfev=31  status=3
reprojection RMSE: 9.957e-10 px

6. Align to ground truth and compare¶

Reprojection error is invariant to a global similarity transform of the scene — rotating, translating and rescaling all cameras and points together leaves every image point unchanged. So minimizing reprojection alone fixes the geometry only up to this 7-DOF gauge freedom; the near-zero RMSE above is the meaningful convergence check, not the raw parameter values. (To pin the gauge outright, anchor a camera and fix a distance via the fixed / shared keys under [bundle_adjustment] in a run config.)

To compare the recovered geometry with ground truth we first remove the gauge with a closed-form similarity alignment (Umeyama, 1991) of the camera centers, then apply the same transform to the cameras and points.

In [8]:

Copied!





def similarity_align(src, dst):
    """Least-squares similarity (scale s, rotation R, translation t) with
    ``dst ~= s * (R @ src) + t`` (Umeyama 1991)."""
    mu_s, mu_d = src.mean(0), dst.mean(0)
    src_c, dst_c = src - mu_s, dst - mu_d
    cov = dst_c.T @ src_c / len(src)
    U, sigma, Vt = np.linalg.svd(cov)
    signs = np.ones(3)
    signs[-1] = np.sign(np.linalg.det(U @ Vt))  # keep a proper rotation
    R = U @ np.diag(signs) @ Vt
    s = (sigma * signs).sum() / (src_c**2).sum() * len(src)
    return s, R, mu_d - s * R @ mu_s


def camera_centers(rvecs, tvecs):
    rmats = np.asarray(geom.rvec_to_rmat(rvecs))
    return -np.einsum("oij,oi->oj", rmats, tvecs)


centers_gt = camera_centers(group.rvecs, group.tvecs)
s, R, t = similarity_align(camera_centers(cams_opt.rvecs, cams_opt.tvecs), centers_gt)

# Apply X' = s R X + t to the world: cameras transform as R_i' = R_i R^T,
# t_i' = s t_i - R_i' t (projection is scale-invariant), points as X' = s R X + t.
rmats_aligned = np.asarray(geom.rvec_to_rmat(cams_opt.rvecs)) @ R.T
tvecs_aligned = s * cams_opt.tvecs - np.einsum("oij,j->oi", rmats_aligned, t)
rvecs_aligned = np.asarray(geom.rmat_to_rvec(rmats_aligned))
pts3d_aligned = pts3d_opt @ (s * R).T + t

print(f"recovered scale: {s:.4f}")
print(
    "max camera-center error:",
    np.max(
        np.linalg.norm(
            camera_centers(rvecs_aligned, tvecs_aligned) - centers_gt, axis=-1
        )
    ),
)
print("max point error:", np.max(np.linalg.norm(pts3d_aligned - pts3d_gt, axis=-1)))

fig = go.Figure()
plot_rig(group.rvecs, group.tvecs, fig, names=group.names, opacity=0.3)
plot_rig(rvecs_aligned, tvecs_aligned, fig)
fig.update_layout(
    title_text="ground truth (faded) vs. recovered (similarity-aligned)",
    height=560,
    margin=dict(l=0, r=0, t=40, b=0),
    # top-down (bird's-eye) view: look straight down +z with +y up.
    scene=rig_scene(eye=dict(x=0, y=0, z=2.4), up=dict(x=0, y=1, z=0)),
)
show(fig)
def similarity_align(src, dst):
    """Least-squares similarity (scale s, rotation R, translation t) with
    ``dst ~= s * (R @ src) + t`` (Umeyama 1991)."""
    mu_s, mu_d = src.mean(0), dst.mean(0)
    src_c, dst_c = src - mu_s, dst - mu_d
    cov = dst_c.T @ src_c / len(src)
    U, sigma, Vt = np.linalg.svd(cov)
    signs = np.ones(3)
    signs[-1] = np.sign(np.linalg.det(U @ Vt))  # keep a proper rotation
    R = U @ np.diag(signs) @ Vt
    s = (sigma * signs).sum() / (src_c**2).sum() * len(src)
    return s, R, mu_d - s * R @ mu_s


def camera_centers(rvecs, tvecs):
    rmats = np.asarray(geom.rvec_to_rmat(rvecs))
    return -np.einsum("oij,oi->oj", rmats, tvecs)


centers_gt = camera_centers(group.rvecs, group.tvecs)
s, R, t = similarity_align(camera_centers(cams_opt.rvecs, cams_opt.tvecs), centers_gt)

# Apply X' = s R X + t to the world: cameras transform as R_i' = R_i R^T,
# t_i' = s t_i - R_i' t (projection is scale-invariant), points as X' = s R X + t.
rmats_aligned = np.asarray(geom.rvec_to_rmat(cams_opt.rvecs)) @ R.T
tvecs_aligned = s * cams_opt.tvecs - np.einsum("oij,j->oi", rmats_aligned, t)
rvecs_aligned = np.asarray(geom.rmat_to_rvec(rmats_aligned))
pts3d_aligned = pts3d_opt @ (s * R).T + t

print(f"recovered scale: {s:.4f}")
print(
    "max camera-center error:",
    np.max(
        np.linalg.norm(
            camera_centers(rvecs_aligned, tvecs_aligned) - centers_gt, axis=-1
        )
    ),
)
print("max point error:", np.max(np.linalg.norm(pts3d_aligned - pts3d_gt, axis=-1)))

fig = go.Figure()
plot_rig(group.rvecs, group.tvecs, fig, names=group.names, opacity=0.3)
plot_rig(rvecs_aligned, tvecs_aligned, fig)
fig.update_layout(
    title_text="ground truth (faded) vs. recovered (similarity-aligned)",
    height=560,
    margin=dict(l=0, r=0, t=40, b=0),
    # top-down (bird's-eye) view: look straight down +z with +y up.
    scene=rig_scene(eye=dict(x=0, y=0, z=2.4), up=dict(x=0, y=1, z=0)),
)
show(fig)

recovered scale: 0.9760
max camera-center error: 4.239051743802059e-09
max point error: 7.103692050014605e-10

7. Robustness to outliers¶

Real observations are never this clean: a 2D detector occasionally locks onto the wrong feature and reports a confidently wrong point. To test how bundle adjustment copes, we rerun the experiment with corrupted observations: mild Gaussian noise (σ = 0.5 px) on every observation to set a realistic error floor, plus gross outliers — a random 10% of observations displaced by 30–150 px in a random direction. The cameras still start from the perturbed guess cams0.

In [9]:

Copied!





rng_corrupt = np.random.default_rng(7)
n_views, n_pts = pts2d.shape[:2]

# mild detection noise on every observation
pts2d_corrupt = pts2d + rng_corrupt.normal(scale=0.5, size=pts2d.shape)

# gross outliers on a random 10%: 30-150 px in a random direction
n_out = round(0.1 * n_views * n_pts)
out_view, out_pt = np.unravel_index(
    rng_corrupt.choice(n_views * n_pts, size=n_out, replace=False), (n_views, n_pts)
)
angle = rng_corrupt.uniform(0, 2 * np.pi, size=n_out)
magnitude = rng_corrupt.uniform(30, 150, size=n_out)
pts2d_corrupt[out_view, out_pt] += magnitude[:, None] * np.stack(
    [np.cos(angle), np.sin(angle)], axis=-1
)

v = group.names.index("f")
sel = out_pt[out_view == v]

fig = go.Figure()
fig.add_scatter(
    x=pts2d[v, :, 0],
    y=pts2d[v, :, 1],
    mode="markers",
    marker=dict(size=5, color="gray"),
    name="clean",
)
# displacement segments (clean -> outlier) as one trace, None-separated
xs, ys = [], []
for p in sel:
    xs += [pts2d[v, p, 0], pts2d_corrupt[v, p, 0], None]
    ys += [pts2d[v, p, 1], pts2d_corrupt[v, p, 1], None]
fig.add_scatter(
    x=xs,
    y=ys,
    mode="lines",
    line=dict(color="rgb(214,39,40)", width=1),
    opacity=0.6,
    showlegend=False,
    hoverinfo="skip",
)
fig.add_scatter(
    x=pts2d_corrupt[v, sel, 0],
    y=pts2d_corrupt[v, sel, 1],
    mode="markers",
    marker=dict(size=9, color="rgb(214,39,40)", symbol="x"),
    name="outlier",
)
# equal aspect; image coordinates have y growing downward
fig.update_yaxes(scaleanchor="x", scaleratio=1, autorange="reversed")
fig.update_layout(
    title_text=f"camera 'f': {len(sel)} of {n_pts} observations corrupted",
    height=420,
    margin=dict(l=0, r=0, t=40, b=0),
)
show(fig)
rng_corrupt = np.random.default_rng(7)
n_views, n_pts = pts2d.shape[:2]

# mild detection noise on every observation
pts2d_corrupt = pts2d + rng_corrupt.normal(scale=0.5, size=pts2d.shape)

# gross outliers on a random 10%: 30-150 px in a random direction
n_out = round(0.1 * n_views * n_pts)
out_view, out_pt = np.unravel_index(
    rng_corrupt.choice(n_views * n_pts, size=n_out, replace=False), (n_views, n_pts)
)
angle = rng_corrupt.uniform(0, 2 * np.pi, size=n_out)
magnitude = rng_corrupt.uniform(30, 150, size=n_out)
pts2d_corrupt[out_view, out_pt] += magnitude[:, None] * np.stack(
    [np.cos(angle), np.sin(angle)], axis=-1
)

v = group.names.index("f")
sel = out_pt[out_view == v]

fig = go.Figure()
fig.add_scatter(
    x=pts2d[v, :, 0],
    y=pts2d[v, :, 1],
    mode="markers",
    marker=dict(size=5, color="gray"),
    name="clean",
)
# displacement segments (clean -> outlier) as one trace, None-separated
xs, ys = [], []
for p in sel:
    xs += [pts2d[v, p, 0], pts2d_corrupt[v, p, 0], None]
    ys += [pts2d[v, p, 1], pts2d_corrupt[v, p, 1], None]
fig.add_scatter(
    x=xs,
    y=ys,
    mode="lines",
    line=dict(color="rgb(214,39,40)", width=1),
    opacity=0.6,
    showlegend=False,
    hoverinfo="skip",
)
fig.add_scatter(
    x=pts2d_corrupt[v, sel, 0],
    y=pts2d_corrupt[v, sel, 1],
    mode="markers",
    marker=dict(size=9, color="rgb(214,39,40)", symbol="x"),
    name="outlier",
)
# equal aspect; image coordinates have y growing downward
fig.update_yaxes(scaleanchor="x", scaleratio=1, autorange="reversed")
fig.update_layout(
    title_text=f"camera 'f': {len(sel)} of {n_pts} observations corrupted",
    height=420,
    margin=dict(l=0, r=0, t=40, b=0),
)
show(fig)

With the default loss="linear", cost is the sum of squared residuals, so a single 100 px outlier contributes as much pull as ten thousand 1 px inliers — the fit trades real accuracy everywhere to shave the few gross errors. loss="huber" (forwarded, like f_scale, to scipy.optimize.least_squares) is quadratic for residuals below f_scale and linear above, bounding each outlier's influence. A proper f_scale sits between the inlier noise level and the outlier magnitude: here noise is σ = 0.5 px and outliers start at 30 px, so f_scale = 1 works. We also fit with f_scale = 1000, which puts every residual in the quadratic regime and silently degenerates to the linear fit.

(An implementation note: scipy's native robust-loss handling rescales each Jacobian row by a second-order "Triggs" factor that is exactly zero for Huber residuals beyond f_scale, and the degenerate rows stall its sparse LSMR trust-region solver far from the optimum. bundle_adjust therefore swaps the string "huber" for an IRLS-form callable — identical cost and gradient, ρ″ = 0 — which keeps the subproblem well-posed; see deeperfly.bundle_adjustment.core._IRLS_LOSSES.)

In [10]:

Copied!





def evaluate(cams_fit, pts3d_fit):
    """Errors vs. the true projections / geometry (gauge aligned on the points)."""
    rmse = np.sqrt(np.nanmean((np.asarray(cams_fit.project(pts3d_fit)) - pts2d) ** 2))
    s, R, t = similarity_align(pts3d_fit, pts3d_gt)
    pt_err = np.linalg.norm(pts3d_fit @ (s * R).T + t - pts3d_gt, axis=-1)
    centers = camera_centers(cams_fit.rvecs, cams_fit.tvecs) @ (s * R).T + t
    return (
        rmse,
        np.median(pt_err),
        np.max(np.linalg.norm(centers - centers_gt, axis=-1)),
    )


header = f"{'fit':22s}{'RMSE vs true [px]':>19s}{'median point err':>18s}{'max center err':>16s}"
print(header + "\n" + "-" * len(header))
for label, kwargs in {
    "linear": {},
    "huber, f_scale=1": dict(loss="huber", f_scale=1.0),
    "huber, f_scale=1000": dict(loss="huber", f_scale=1000.0),
}.items():
    res, cams_fit, pts3d_fit = bundle_adjust(
        cams0, pts2d_corrupt, fixed=["*.intr"], max_nfev=2000, **kwargs
    )
    rmse, pt_err, ctr_err = evaluate(cams_fit, pts3d_fit)
    note = "" if res.status > 0 else "   <- hit max_nfev"
    print(f"{label:22s}{rmse:19.3f}{pt_err:18.4f}{ctr_err:16.3f}{note}")
def evaluate(cams_fit, pts3d_fit):
    """Errors vs. the true projections / geometry (gauge aligned on the points)."""
    rmse = np.sqrt(np.nanmean((np.asarray(cams_fit.project(pts3d_fit)) - pts2d) ** 2))
    s, R, t = similarity_align(pts3d_fit, pts3d_gt)
    pt_err = np.linalg.norm(pts3d_fit @ (s * R).T + t - pts3d_gt, axis=-1)
    centers = camera_centers(cams_fit.rvecs, cams_fit.tvecs) @ (s * R).T + t
    return (
        rmse,
        np.median(pt_err),
        np.max(np.linalg.norm(centers - centers_gt, axis=-1)),
    )


header = f"{'fit':22s}{'RMSE vs true [px]':>19s}{'median point err':>18s}{'max center err':>16s}"
print(header + "\n" + "-" * len(header))
for label, kwargs in {
    "linear": {},
    "huber, f_scale=1": dict(loss="huber", f_scale=1.0),
    "huber, f_scale=1000": dict(loss="huber", f_scale=1000.0),
}.items():
    res, cams_fit, pts3d_fit = bundle_adjust(
        cams0, pts2d_corrupt, fixed=["*.intr"], max_nfev=2000, **kwargs
    )
    rmse, pt_err, ctr_err = evaluate(cams_fit, pts3d_fit)
    note = "" if res.status > 0 else "   <- hit max_nfev"
    print(f"{label:22s}{rmse:19.3f}{pt_err:18.4f}{ctr_err:16.3f}{note}")

fit                     RMSE vs true [px]  median point err  max center err
---------------------------------------------------------------------------

linear                             10.679            0.0540          13.604

huber, f_scale=1                    0.371            0.0027           0.753

huber, f_scale=1000                10.668            0.0529          13.437

The linear fit converges, but to the wrong rig: the 10% of gross outliers leave a ~10 px error floor against the true projections and pull the camera centers visibly off. The Huber fit with f_scale=1 recovers the geometry down to the noise floor — a ~30× improvement — because the bounded outlier influence lets the 90% clean observations dominate. The f_scale=1000 row shows that the threshold matters: it exceeds every residual, so the "robust" fit is exactly the linear one.

In a run config the robust fit maps to flat keys under [bundle_adjustment]:

[bundle_adjustment]
loss = "huber"
f_scale = 1.0

One caveat: Huber bounds each outlier's pull but does not zero it, so a point whose observations are mostly corrupted is still dragged (the worst point here has 4 of its 7 views corrupted). For heavier contamination, downweight or drop suspect observations before fitting — bundle_adjust accepts per-observation weights, which the pipeline fills with detector confidences.