# Number of optimisation steps drawn in each trajectory panel (top row).
PLOT_STEPS = 55

# Dense grid over parameter space for contouring the loss surface.
x_ = np.linspace(-5, 5, 500)
y_ = np.linspace(-2.2, 2.2, 500)
X, Y = np.meshgrid(x_, y_)
Z = loss(X, Y)  # loss() is defined earlier in the file
# 2x3 layout: three trajectory panels on top; loss curves (spanning two
# cells) and an annotation panel on the bottom row.
fig = plt.figure(figsize=(16, 10), facecolor="#FAFAF8")
gs = fig.add_gridspec(2, 3, hspace=0.45,
                      left=0.07, right=0.97, top=0.88, bottom=0.08)
# Shared colour palette for the whole figure: one colour per optimiser run,
# plus colours for contours, markers, and the start point.
COLORS = dict(
    gd="#E05C4B",
    mom_good="#3A7CA5",
    mom_large="#F4A536",
    contour="#D4C9B8",
    minima="#2A9D5C",
    start="#444444",
)
# Two-line titles for the three trajectory panels (newline separates the
# optimiser name from its behaviour summary).
PANEL_TITLES = [
    "Vanilla Gradient Descent\nOscillates, slow (185 steps to converge)",
    "Momentum β = 0.90\nSmooth, fast (159 steps to converge)",
    "Momentum β = 0.99 (too large)\nOvershoots -- never converges",
]
# Truncate each full optimisation path to the first PLOT_STEPS steps
# (plus the starting point) for the trajectory panels.
paths_plot = [
    full_path[:PLOT_STEPS + 1]
    for full_path in (path_gd, path_mom_good, path_mom_large)
]
colors = [COLORS["gd"]COLORS["mom_good"]COLORS["mom_large"]]
# Top row: one trajectory panel per optimiser configuration.
for col, (path, color, title) in enumerate(zip(paths_plot, colors, PANEL_TITLES)):
    ax = fig.add_subplot(gs[0, col])
    ax.set_facecolor("#F5F3EE")
    # Geometrically spaced contour levels resolve the flat valley floor.
    # NOTE(review): the level arguments were garbled in the source text;
    # geomspace(0.005, 28, 35) is a reconstruction — confirm against original.
    levels = np.geomspace(0.005, 28, 35)
    ax.contour(X, Y, Z, levels=levels, colors=COLORS["contour"],
               linewidths=0.7, alpha=0.9)
    # Trajectory: line plus faint per-step markers.
    ax.plot(path[:, 0], path[:, 1], color=color, lw=1.8, alpha=0.85, zorder=3)
    ax.scatter(path[:, 0], path[:, 1], color=color, s=18, zorder=4, alpha=0.6)
    # Start point, final point, and the true minimum at the origin.
    ax.scatter(*path[0], marker="o", s=90, color=COLORS["start"], zorder=5, label="start")
    ax.scatter(*path[-1], marker="*", s=120, color=COLORS["minima"], zorder=5, label="end")
    ax.scatter(0, 0, marker="+", s=200, color=COLORS["minima"], linewidths=2.5, zorder=6)
    ax.set_xlim(-5, 5)
    ax.set_ylim(-2.2, 2.2)
    ax.set_title(title, fontsize=9.5, fontweight="bold", color="#222", pad=7, loc="left")
    ax.set_xlabel("θ₁ (slow direction)", fontsize=8, color="#666")
    ax.set_ylabel("θ₂ (fast direction)", fontsize=8, color="#666")
    ax.tick_params(labelsize=7, colors="#888")
    for spine in ax.spines.values():
        spine.set_edgecolor("#CCCCCC")
# Bottom-left (two cells wide): loss curves over the full 300-step runs.
ax_loss = fig.add_subplot(gs[1, :2])
ax_loss.set_facecolor("#F5F3EE")
full_paths = [path_gd, path_mom_good, path_mom_large]
full_labels = ["Vanilla GD (185 steps)", "Momentum β=0.90 (159 steps)", "Momentum β=0.99 (diverges)"]
for path, color, label in zip(full_paths, colors, full_labels):
    # Evaluate the loss at every visited parameter point.
    losses = [loss(*p) for p in path]
    steps_range = np.arange(len(path))
    ax_loss.plot(steps_range, losses, color=color, lw=2, label=label, alpha=0.9)
# Dashed line marks the convergence threshold quoted in the panel titles.
ax_loss.axhline(0.001, color="#999", lw=1, ls="--", alpha=0.6)
ax_loss.text(305, 0.001, "convergence\nthreshold", fontsize=7, color="#888", va="center")
ax_loss.set_yscale("log")
ax_loss.set_xlim(0, STEPS)
ax_loss.set_title("Loss vs. Optimisation Step (log scale, 300 steps)",
                  fontsize=10.5, fontweight="bold", color="#222", loc="left")
ax_loss.set_xlabel("Step", fontsize=9, color="#666")
ax_loss.set_ylabel("Loss f(θ)", fontsize=9, color="#666")
ax_loss.legend(fontsize=8.5, framealpha=0.6)
ax_loss.tick_params(labelsize=8, colors="#888")
for spine in ax_loss.spines.values():
    spine.set_edgecolor("#CCCCCC")
# Bottom-right: text-only annotation panel with the update rules and
# intuition for the momentum hyperparameter β.
ax_ann = fig.add_subplot(gs[1, 2])
ax_ann.set_facecolor("#F5F3EE")
ax_ann.axis("off")
annotation = (
    "Update rules\n\n"
    "Vanilla GD\n"
    " θ ← θ − α·∇L(θ)\n\n"
    "Momentum GD\n"
    " v ← β·v + (1−β)·∇L(θ)\n"
    " θ ← θ − α·v\n\n"
    "Key intuition\n"
    " v accumulates past gradients.\n"
    " Vertical oscillations cancel out.\n"
    " Horizontal steps compound.\n\n"
    "Hyperparameter β\n"
    " β → 0 : behaves like GD\n"
    " β = 0.9: typical sweet spot\n"
    " β → 1 : overshoots / diverges"
)
ax_ann.text(0.05, 0.97, annotation, transform=ax_ann.transAxes,
            fontsize=8.8, va="top", ha="left",
            fontfamily="monospace", color="#333", linespacing=1.7)
# Figure-level title, then export to PNG and display.
fig.suptitle("Momentum in Gradient Descent",
             fontsize=16, fontweight="bold", color="#111", y=0.95)
# bbox_inches="tight" trims surrounding whitespace; keep the figure's
# off-white facecolor in the saved file.
plt.savefig("momentum_explainer.png", dpi=150, bbox_inches="tight",
            facecolor=fig.get_facecolor())
plt.show()
Trending
- Google DeepMind Workers vote for unionization over AI military deals
- The Problem with Gradient Descent and the Solution Using Momentum
- Google adds event-driven webhooks to its Gemini API, eliminating the need for polling during long-running AI jobs
- How to Build an End-to-End Production-Grade Machine Learning Pipeline with ZenML, Including Custom Materializers, Metadata Tracking, and Hyperparameter Optimization
- Zyphra’s new Tensor and sequence Parallelism (TSP), a hardware-aware strategy for training and inference, delivers up to 2.6x the throughput of matched TP+SP benchmarks
- Greg Brockman Defends $30B OpenAI Stake: ‘Blood, Sweat, and Tears’
- The Top Fetch and Search APIs to Build AI Agents by 2026: Trade-offs, Free Tiers and Tools
- This coding implementation lets you explore and analyze the TaskTrove dataset using visualizations of parsing and verifier detection.

