From 0990a14d7bd2c1b91614167199e1ba1ce9d8745e Mon Sep 17 00:00:00 2001
From: Dongdong Tian <seisman.info@gmail.com>
Date: Fri, 29 May 2026 15:22:00 +0800
Subject: [PATCH 1/2] Add the benchmark for a simple scatter plot

---
 README.md                               |  16 +-
 benchmarks/bench_matplotlib_diamonds.py | 212 ++++++++++++++++++++++++
 2 files changed, 227 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/bench_matplotlib_diamonds.py

diff --git a/README.md b/README.md
index 455dc2a..5b5edf7 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,21 @@ conda env create -f environment.yml
 conda activate pygmt-benchmarking
 ```
 
+## Benchmarks
+
+The benchmarks were run on a MacBook Pro (Apple M5, 32GB RAM) running macOS Tahoe 26.1.
+
+### Benchmark 1: Diamonds Dataset
+
+The diamonds benchmark compares PyGMT and matplotlib when plotting the seaborn diamonds
+dataset.
+
+| Mode | matplotlib | PyGMT | Ratio |
+| --- | --- | --- | --- |
+| Plot | 0.0184s | 0.0299s | 0.6x |
+| Plot and Save | 0.1213s | 0.8311s | 0.15x |
+
 ## License
 
 This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for
-details.
\ No newline at end of file
+details.
diff --git a/benchmarks/bench_matplotlib_diamonds.py b/benchmarks/bench_matplotlib_diamonds.py
new file mode 100644
index 0000000..45eaec5
--- /dev/null
+++ b/benchmarks/bench_matplotlib_diamonds.py
@@ -0,0 +1,212 @@
+"""
+Benchmark PyGMT and matplotlib when plotting the diamonds dataset.
+
+This benchmark can time either plot construction only or the complete plotting
+workflow: create a fresh figure, draw the scatter plot, add labels/legend, and write
+the figure to disk. Dataset loading is intentionally outside the timed section so the
+results focus on plotting and rendering rather than network or CSV parsing time.
+"""
+
+import argparse
+import statistics
+import time
+from pathlib import Path
+
+import pandas as pd
+import pygmt
+import matplotlib.pyplot as plt  # noqa: E402
+
+
+COLORS = ("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd")
+BACKENDS = ("matplotlib", "pygmt")
+MODES = ("plot", "save")
+DIAMONDS_DATA_URL = (
+    "https://github.com/mwaskom/seaborn-data/raw/master/diamonds.csv"
+)
+CUT_ORDER = ("Fair", "Good", "Very Good", "Premium", "Ideal")
+
+# Matplotlib interprets scatter ``s`` as marker area in points squared, while PyGMT's
+# circle style uses marker diameter. Keep one diameter-like value here and convert
+# it for matplotlib inside ``plot_matplotlib``.
+MARKER_SIZE_POINTS = 2
+
+
+def load_diamonds_data() -> pd.DataFrame:
+    """Load the diamonds dataset from seaborn's example-data repository."""
+    return pd.read_csv(DIAMONDS_DATA_URL)
+
+
+def plot_matplotlib(data: pd.DataFrame, output: Path, save: bool) -> None:
+    """Create and render the diamonds scatter plot with matplotlib.
+
+    The function deliberately creates a new figure on each call so repeated runs
+    measure the full plotting workflow, matching the PyGMT function below. Set
+    ``save`` to false to measure plot construction without file export.
+    """
+    fig, ax = plt.subplots(figsize=(6, 4), dpi=300)
+
+    for cut_id, cut_name in enumerate(CUT_ORDER):
+        cut_data = data[data["cut"] == cut_name]
+        ax.scatter(
+            cut_data["carat"],
+            cut_data["price"],
+            label=cut_name,
+            s=MARKER_SIZE_POINTS**2,
+            marker="o",
+            color=COLORS[cut_id],
+            alpha=0.5,
+            linewidths=0,
+        )
+    ax.set_xlabel("Carat")
+    ax.set_ylabel("Price (USD)")
+    ax.set_title("Diamond price by carat")
+    ax.legend(title="Cut", frameon=False, markerscale=4)
+    if save:
+        fig.savefig(output)
+    plt.close(fig)
+
+
+def plot_pygmt(data: pd.DataFrame, output: Path, save: bool) -> None:
+    """Create and render the diamonds scatter plot with PyGMT.
+
+    PyGMT accepts tabular data directly. Passing only the two plotted columns keeps
+    this comparable to the x/y arrays handed to matplotlib. Set ``save`` to false to
+    measure plot construction without file export.
+    """
+    fig = pygmt.Figure()
+    fig.basemap(
+        region=[0, 5.5, 0, 20000],
+        projection="X6i/4i",
+        frame=pygmt.params.Frame(
+            axes="WSne",
+            title="Diamond price by carat",
+            xaxis=pygmt.params.Axis(annot=True, tick=True, label="Carat"),
+            yaxis=pygmt.params.Axis(annot=True, tick=True, label="Price (USD)"),
+        )
+    )
+
+    for cut_id, cut_name in enumerate(CUT_ORDER):
+        cut_data = data[data["cut"] == cut_name]
+        fig.plot(
+            x=cut_data["carat"],
+            y=cut_data["price"],
+            style=f"c{MARKER_SIZE_POINTS}p",
+            fill=f"{COLORS[cut_id]}@50",
+            label=cut_name,
+        )
+
+    fig.legend(position=pygmt.params.Position("TR", offset=0.1))
+    if save:
+        fig.savefig(output)
+
+
+def benchmark(
+    name: str,
+    plot_func,
+    data: pd.DataFrame,
+    output_dir: Path,
+    repeats: int,
+    save: bool,
+) -> list[float]:
+    """Time repeated plot creation and rendering runs.
+
+    The first call is an untimed warmup. It absorbs one-time backend setup such as
+    font discovery, GMT session initialization, and dynamic library loading.
+    """
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    # Warm up each backend once before recording timings.
+    plot_func(data, output_dir / f"{name}_warmup.png", save)
+
+    timings = []
+    for run_id in range(repeats):
+        output = output_dir / f"{name}_{run_id + 1}.png"
+        start = time.perf_counter()
+        plot_func(data, output, save)
+        timings.append(time.perf_counter() - start)
+
+    return timings
+
+
+def format_summary(name: str, timings: list[float]) -> str:
+    """Format mean/median/min/max of ``timings`` (seconds) behind an 18-char label."""
+    mean = statistics.fmean(timings)
+    median = statistics.median(timings)
+    minimum = min(timings)
+    maximum = max(timings)
+    return (
+        f"{name:18s} "
+        f"mean={mean:.4f}s "
+        f"median={median:.4f}s "
+        f"min={minimum:.4f}s "
+        f"max={maximum:.4f}s"
+    )
+
+
+def parse_args() -> argparse.Namespace:
+    """Parse command-line arguments."""
+    parser = argparse.ArgumentParser(
+        description="Benchmark PyGMT and matplotlib diamonds scatter plotting."
+    )
+    parser.add_argument(
+        "--backend",
+        choices=(*BACKENDS, "all"),
+        default="all",
+        help="plotting backend to benchmark",
+    )
+    parser.add_argument(
+        "--repeats",
+        type=int,
+        default=10,
+        help="number of timed runs per backend",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=Path("plots/diamonds"),
+        help="directory for rendered PNG files",
+    )
+    parser.add_argument(
+        "--mode",
+        choices=(*MODES, "both"),
+        default="both",
+        help="benchmark plot construction, plot-and-save, or both",
+    )
+    return parser.parse_args()
+
+
+def main() -> None:
+    """Run the diamonds plotting benchmark."""
+    args = parse_args()
+    if args.repeats < 1:
+        raise SystemExit("--repeats must be at least 1")
+
+    data = load_diamonds_data()
+    selected_backends = BACKENDS if args.backend == "all" else (args.backend,)
+    plotters = {
+        "matplotlib": plot_matplotlib,
+        "pygmt": plot_pygmt,
+    }
+    selected_modes = MODES if args.mode == "both" else (args.mode,)
+
+    print(f"Running {args.repeats} timed run(s) per backend")
+    if "save" in selected_modes:
+        print(f"Writing PNG files to {args.output_dir}")
+    for mode in selected_modes:
+        save = mode == "save"
+        print(f"Mode: {mode}")
+        for backend in selected_backends:
+            print(f"Benchmarking {backend}...", flush=True)
+            timings = benchmark(
+                name=f"{backend}_{mode}",
+                plot_func=plotters[backend],
+                data=data,
+                output_dir=args.output_dir,
+                repeats=args.repeats,
+                save=save,
+            )
+            print(format_summary(backend, timings))
+
+
+if __name__ == "__main__":
+    main()

From c434cad444e3c7d6f477e50e98078711b59e9389 Mon Sep 17 00:00:00 2001
From: Dongdong Tian <seisman.info@gmail.com>
Date: Fri, 29 May 2026 17:20:22 +0800
Subject: [PATCH 2/2] Simplify benchmarks

---
 README.md                               |  19 ++-
 benchmarks/bench_matplotlib_diamonds.py | 146 +++++++++---------------
 2 files changed, 67 insertions(+), 98 deletions(-)

diff --git a/README.md b/README.md
index 5b5edf7..2c69a44 100644
--- a/README.md
+++ b/README.md
@@ -14,17 +14,24 @@ conda activate pygmt-benchmarking
 
 ## Benchmarks
 
-The benchmarks were run on a MacBook Pro (Apple M5, 32GB RAM) running macOS Tahoe 26.1.
+The benchmarks were run on a MacBook Pro (Apple M5, 32GB RAM) running macOS
+Tahoe 26.1.
 
 ### Benchmark 1: Diamonds Dataset
 
-The diamonds benchmark compares PyGMT and matplotlib when plotting the seaborn diamonds
-dataset.
+The diamonds benchmark compares PyGMT and matplotlib when plotting the seaborn
+diamonds dataset. It reports plotting time and `savefig` time separately.
 
-| Mode | matplotlib | PyGMT | Ratio |
+Run the benchmark:
+
+```bash
+python benchmarks/bench_matplotlib_diamonds.py
+```
+
+| Step | matplotlib (s) | PyGMT (s) | matplotlib / PyGMT |
 | --- | --- | --- | --- |
-| Plot | 0.0184s | 0.0299s | 0.6x |
-| Plot and Save | 0.1213s | 0.8311s | 0.15x |
+| Plotting | 0.021 | 0.028 | 0.75x |
+| Savefig | 0.13 | 1.135 | 0.11x |
 
 ## License
 
diff --git a/benchmarks/bench_matplotlib_diamonds.py b/benchmarks/bench_matplotlib_diamonds.py
index 45eaec5..c527263 100644
--- a/benchmarks/bench_matplotlib_diamonds.py
+++ b/benchmarks/bench_matplotlib_diamonds.py
@@ -1,13 +1,7 @@
 """
 Benchmark PyGMT and matplotlib when plotting the diamonds dataset.
-
-This benchmark can time either plot construction only or the complete plotting
-workflow: create a fresh figure, draw the scatter plot, add labels/legend, and write
-the figure to disk. Dataset loading is intentionally outside the timed section so the
-results focus on plotting and rendering rather than network or CSV parsing time.
 """
 
-import argparse
 import statistics
 import time
 from pathlib import Path
@@ -19,7 +13,8 @@
 
 COLORS = ("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd")
 BACKENDS = ("matplotlib", "pygmt")
-MODES = ("plot", "save")
+OUTPUT_DIR = Path("plots/diamonds")
+REPEATS = 10
 DIAMONDS_DATA_URL = (
     "https://github.com/mwaskom/seaborn-data/raw/master/diamonds.csv"
 )
@@ -36,13 +31,8 @@ def load_diamonds_data() -> pd.DataFrame:
     return pd.read_csv(DIAMONDS_DATA_URL)
 
 
-def plot_matplotlib(data: pd.DataFrame, output: Path, save: bool) -> None:
-    """Create and render the diamonds scatter plot with matplotlib.
-
-    The function deliberately creates a new figure on each call so repeated runs
-    measure the full plotting workflow, matching the PyGMT function below. Set
-    ``save`` to false to measure plot construction without file export.
-    """
+def plot_matplotlib(data: pd.DataFrame):
+    """Create the diamonds scatter plot with matplotlib."""
     fig, ax = plt.subplots(figsize=(6, 4), dpi=300)
 
     for cut_id, cut_name in enumerate(CUT_ORDER):
@@ -61,17 +51,17 @@ def plot_matplotlib(data: pd.DataFrame, output: Path, save: bool) -> None:
     ax.set_ylabel("Price (USD)")
     ax.set_title("Diamond price by carat")
     ax.legend(title="Cut", frameon=False, markerscale=4)
-    if save:
-        fig.savefig(output)
-    plt.close(fig)
+    return fig
+
 
+def save_matplotlib(fig, output: Path) -> None:
+    """Save a matplotlib figure and release it."""
+    fig.savefig(output)
+    plt.close(fig)
 
-def plot_pygmt(data: pd.DataFrame, output: Path, save: bool) -> None:
-    """Create and render the diamonds scatter plot with PyGMT.
 
-    PyGMT accepts tabular data directly. Passing only the two plotted columns keeps
-    this comparable to the x/y arrays handed to matplotlib. Set ``save`` to false to
-    measure plot construction without file export.
+def plot_pygmt(data: pd.DataFrame) -> pygmt.Figure:
+    """Create the diamonds scatter plot with PyGMT.
     """
     fig = pygmt.Figure()
     fig.basemap(
@@ -84,7 +74,6 @@ def plot_pygmt(data: pd.DataFrame, output: Path, save: bool) -> None:
             yaxis=pygmt.params.Axis(annot=True, tick=True, label="Price (USD)"),
         )
     )
-
     for cut_id, cut_name in enumerate(CUT_ORDER):
         cut_data = data[data["cut"] == cut_name]
         fig.plot(
@@ -94,38 +83,47 @@ def plot_pygmt(data: pd.DataFrame, output: Path, save: bool) -> None:
             fill=f"{COLORS[cut_id]}@50",
             label=cut_name,
         )
-
     fig.legend(position=pygmt.params.Position("TR", offset=0.1))
-    if save:
-        fig.savefig(output)
+    return fig
+
+
+def save_pygmt(fig: pygmt.Figure, output: Path) -> None:
+    """Save a PyGMT figure."""
+    fig.savefig(output)
 
 
 def benchmark(
     name: str,
     plot_func,
+    save_func,
     data: pd.DataFrame,
     output_dir: Path,
     repeats: int,
-    save: bool,
-) -> list[float]:
-    """Time repeated plot creation and rendering runs.
+) -> tuple[list[float], list[float]]:
+    """Time plot construction and figure export separately over repeated runs.
 
-    The first call is an untimed warmup. It absorbs one-time backend setup such as
-    font discovery, GMT session initialization, and dynamic library loading.
+    An untimed warmup run absorbs one-time backend setup first. Returns the pair
+    ``(plot_timings, save_timings)``, each a list of ``repeats`` durations in seconds.
     """
     output_dir.mkdir(parents=True, exist_ok=True)
 
     # Warm up each backend once before recording timings.
-    plot_func(data, output_dir / f"{name}_warmup.png", save)
+    fig = plot_func(data)
+    save_func(fig, output_dir / f"{name}_warmup.png")
 
-    timings = []
+    plot_timings = []
+    save_timings = []
     for run_id in range(repeats):
         output = output_dir / f"{name}_{run_id + 1}.png"
+        # Stamp the end of plotting so the savefig timing excludes plot construction.
         start = time.perf_counter()
-        plot_func(data, output, save)
-        timings.append(time.perf_counter() - start)
+        fig = plot_func(data)
+        plotted = time.perf_counter()
+        plot_timings.append(plotted - start)
+        save_func(fig, output)
+        save_timings.append(time.perf_counter() - plotted)
 
-    return timings
+    return plot_timings, save_timings
 
 
 def format_summary(name: str, timings: list[float]) -> str:
@@ -143,69 +141,33 @@ def format_summary(name: str, timings: list[float]) -> str:
     )
 
 
-def parse_args() -> argparse.Namespace:
-    """Parse command-line arguments."""
-    parser = argparse.ArgumentParser(
-        description="Benchmark PyGMT and matplotlib diamonds scatter plotting."
-    )
-    parser.add_argument(
-        "--backend",
-        choices=(*BACKENDS, "all"),
-        default="all",
-        help="plotting backend to benchmark",
-    )
-    parser.add_argument(
-        "--repeats",
-        type=int,
-        default=10,
-        help="number of timed runs per backend",
-    )
-    parser.add_argument(
-        "--output-dir",
-        type=Path,
-        default=Path("plots/diamonds"),
-        help="directory for rendered PNG files",
-    )
-    parser.add_argument(
-        "--mode",
-        choices=(*MODES, "both"),
-        default="both",
-        help="benchmark plot construction, plot-and-save, or both",
-    )
-    return parser.parse_args()
-
-
 def main() -> None:
     """Run the diamonds plotting benchmark."""
-    args = parse_args()
-    if args.repeats < 1:
-        raise SystemExit("--repeats must be at least 1")
-
     data = load_diamonds_data()
-    selected_backends = BACKENDS if args.backend == "all" else (args.backend,)
     plotters = {
         "matplotlib": plot_matplotlib,
         "pygmt": plot_pygmt,
     }
-    selected_modes = MODES if args.mode == "both" else (args.mode,)
-
-    print(f"Running {args.repeats} timed run(s) per backend")
-    if "save" in selected_modes:
-        print(f"Writing PNG files to {args.output_dir}")
-    for mode in selected_modes:
-        save = mode == "save"
-        print(f"Mode: {mode}")
-        for backend in selected_backends:
-            print(f"Benchmarking {backend}...", flush=True)
-            timings = benchmark(
-                name=f"{backend}_{mode}",
-                plot_func=plotters[backend],
-                data=data,
-                output_dir=args.output_dir,
-                repeats=args.repeats,
-                save=save,
-            )
-            print(format_summary(backend, timings))
+    savers = {
+        "matplotlib": save_matplotlib,
+        "pygmt": save_pygmt,
+    }
+
+    print(f"Running {REPEATS} timed run(s) per backend")
+    print(f"Writing PNG files to {OUTPUT_DIR}")
+
+    for backend in BACKENDS:
+        print(f"Benchmarking {backend}...", flush=True)
+        plot_timings, save_timings = benchmark(
+            name=backend,
+            plot_func=plotters[backend],
+            save_func=savers[backend],
+            data=data,
+            output_dir=OUTPUT_DIR,
+            repeats=REPEATS,
+        )
+        print(format_summary(f"{backend} plot", plot_timings))
+        print(format_summary(f"{backend} savefig", save_timings))
 
 
 if __name__ == "__main__":