From 0990a14d7bd2c1b91614167199e1ba1ce9d8745e Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Fri, 29 May 2026 15:22:00 +0800 Subject: [PATCH 1/2] Add the benchmark for a simple scatter plot --- README.md | 16 +- benchmarks/bench_matplotlib_diamonds.py | 212 ++++++++++++++++++++++++ 2 files changed, 227 insertions(+), 1 deletion(-) create mode 100644 benchmarks/bench_matplotlib_diamonds.py diff --git a/README.md b/README.md index 455dc2a..5b5edf7 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,21 @@ conda env create -f environment.yml conda activate pygmt-benchmarking ``` +## Benchmarks + +The benchmarks were run on a MacBook Pro (Apple M5, 32GB RAM) running macOS Tahoe 26.1. + +### Benchmark 1: Diamonds Dataset + +The diamonds benchmark compares PyGMT and matplotlib when plotting the seaborn diamonds +dataset. + +| Mode | matplotlib | PyGMT | Ratio | +| --- | --- | --- | --- | +| Plot | 0.0184s | 0.0299s | 0.6x | +| Plot and Save | 0.1213s | 0.8311s | 0.15x | + ## License This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for -details. \ No newline at end of file +details. diff --git a/benchmarks/bench_matplotlib_diamonds.py b/benchmarks/bench_matplotlib_diamonds.py new file mode 100644 index 0000000..45eaec5 --- /dev/null +++ b/benchmarks/bench_matplotlib_diamonds.py @@ -0,0 +1,212 @@ +""" +Benchmark PyGMT and matplotlib when plotting the diamonds dataset. + +This benchmark can time either plot construction only or the complete plotting +workflow: create a fresh figure, draw the scatter plot, add labels/legend, and write +the figure to disk. Dataset loading is intentionally outside the timed section so the +results focus on plotting and rendering rather than network or CSV parsing time. 
+""" + +import argparse +import statistics +import time +from pathlib import Path + +import pandas as pd +import pygmt +import matplotlib.pyplot as plt # noqa: E402 + + +COLORS = ("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd") +BACKENDS = ("matplotlib", "pygmt") +MODES = ("plot", "save") +DIAMONDS_DATA_URL = ( + "https://github.com/mwaskom/seaborn-data/raw/master/diamonds.csv" +) +CUT_ORDER = ("Fair", "Good", "Very Good", "Premium", "Ideal") + +# Matplotlib interprets scatter ``s`` as marker area in points squared, while PyGMT's +# circle style uses marker diameter. Keep one diameter-like value here and convert +# it for matplotlib inside ``plot_matplotlib``. +MARKER_SIZE_POINTS = 2 + + +def load_diamonds_data() -> pd.DataFrame: + """Load the diamonds dataset from seaborn's example-data repository.""" + return pd.read_csv(DIAMONDS_DATA_URL) + + +def plot_matplotlib(data: pd.DataFrame, output: Path, save: bool) -> None: + """Create and render the diamonds scatter plot with matplotlib. + + The function deliberately creates a new figure on each call so repeated runs + measure the full plotting workflow, matching the PyGMT function below. Set + ``save`` to false to measure plot construction without file export. + """ + fig, ax = plt.subplots(figsize=(6, 4), dpi=300) + + for cut_id, cut_name in enumerate(CUT_ORDER): + cut_data = data[data["cut"] == cut_name] + ax.scatter( + cut_data["carat"], + cut_data["price"], + label=cut_name, + s=MARKER_SIZE_POINTS**2, + marker="o", + color=COLORS[cut_id], + alpha=0.5, + linewidths=0, + ) + ax.set_xlabel("Carat") + ax.set_ylabel("Price (USD)") + ax.set_title("Diamond price by carat") + ax.legend(title="Cut", frameon=False, markerscale=4) + if save: + fig.savefig(output) + plt.close(fig) + + +def plot_pygmt(data: pd.DataFrame, output: Path, save: bool) -> None: + """Create and render the diamonds scatter plot with PyGMT. + + PyGMT accepts tabular data directly. 
Passing only the two plotted columns keeps + this comparable to the x/y arrays handed to matplotlib. Set ``save`` to false to + measure plot construction without file export. + """ + fig = pygmt.Figure() + fig.basemap( + region=[0, 5.5, 0, 20000], + projection="X6i/4i", + frame=pygmt.params.Frame( + axes="WSne", + title="Diamond price by carat", + xaxis=pygmt.params.Axis(annot=True, tick=True, label="Carat"), + yaxis=pygmt.params.Axis(annot=True, tick=True, label="Price (USD)"), + ) + ) + + for cut_id, cut_name in enumerate(CUT_ORDER): + cut_data = data[data["cut"] == cut_name] + fig.plot( + x=cut_data["carat"], + y=cut_data["price"], + style=f"c{MARKER_SIZE_POINTS}p", + fill=f"{COLORS[cut_id]}@50", + label=cut_name, + ) + + fig.legend(position=pygmt.params.Position("TR", offset=0.1)) + if save: + fig.savefig(output) + + +def benchmark( + name: str, + plot_func, + data: pd.DataFrame, + output_dir: Path, + repeats: int, + save: bool, +) -> list[float]: + """Time repeated plot creation and rendering runs. + + The first call is an untimed warmup. It absorbs one-time backend setup such as + font discovery, GMT session initialization, and dynamic library loading. + """ + output_dir.mkdir(parents=True, exist_ok=True) + + # Warm up each backend once before recording timings. 
+ plot_func(data, output_dir / f"{name}_warmup.png", save) + + timings = [] + for run_id in range(repeats): + output = output_dir / f"{name}_{run_id + 1}.png" + start = time.perf_counter() + plot_func(data, output, save) + timings.append(time.perf_counter() - start) + + return timings + + +def format_summary(name: str, timings: list[float]) -> str: + """Format benchmark timing statistics.""" + mean = statistics.fmean(timings) + median = statistics.median(timings) + minimum = min(timings) + maximum = max(timings) + return ( + f"{name:10s} " + f"mean={mean:.4f}s " + f"median={median:.4f}s " + f"min={minimum:.4f}s " + f"max={maximum:.4f}s" + ) + + +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + parser = argparse.ArgumentParser( + description="Benchmark PyGMT and matplotlib diamonds scatter plotting." + ) + parser.add_argument( + "--backend", + choices=(*BACKENDS, "all"), + default="all", + help="plotting backend to benchmark", + ) + parser.add_argument( + "--repeats", + type=int, + default=10, + help="number of timed runs per backend", + ) + parser.add_argument( + "--output-dir", + type=Path, + default=Path("plots/diamonds"), + help="directory for rendered PNG files", + ) + parser.add_argument( + "--mode", + choices=(*MODES, "both"), + default="both", + help="benchmark plot construction, plot-and-save, or both", + ) + return parser.parse_args() + + +def main() -> None: + """Run the diamonds plotting benchmark.""" + args = parse_args() + if args.repeats < 1: + raise SystemExit("--repeats must be at least 1") + + data = load_diamonds_data() + selected_backends = BACKENDS if args.backend == "all" else (args.backend,) + plotters = { + "matplotlib": plot_matplotlib, + "pygmt": plot_pygmt, + } + selected_modes = MODES if args.mode == "both" else (args.mode,) + + print(f"Running {args.repeats} timed run(s) per backend") + if "save" in selected_modes: + print(f"Writing PNG files to {args.output_dir}") + for mode in selected_modes: + save = 
mode == "save" + print(f"Mode: {mode}") + for backend in selected_backends: + print(f"Benchmarking {backend}...", flush=True) + timings = benchmark( + name=f"{backend}_{mode}", + plot_func=plotters[backend], + data=data, + output_dir=args.output_dir, + repeats=args.repeats, + save=save, + ) + print(format_summary(backend, timings)) + + +if __name__ == "__main__": + main() From c434cad444e3c7d6f477e50e98078711b59e9389 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Fri, 29 May 2026 17:20:22 +0800 Subject: [PATCH 2/2] Simplify benchmarks --- README.md | 19 ++- benchmarks/bench_matplotlib_diamonds.py | 146 +++++++++--------------- 2 files changed, 67 insertions(+), 98 deletions(-) diff --git a/README.md b/README.md index 5b5edf7..2c69a44 100644 --- a/README.md +++ b/README.md @@ -14,17 +14,24 @@ conda activate pygmt-benchmarking ## Benchmarks -The benchmarks were run on a MacBook Pro (Apple M5, 32GB RAM) running macOS Tahoe 26.1. +The benchmarks were run on a MacBook Pro (Apple M5, 32GB RAM) running macOS +Tahoe 26.1. ### Benchmark 1: Diamonds Dataset -The diamonds benchmark compares PyGMT and matplotlib when plotting the seaborn diamonds -dataset. +The diamonds benchmark compares PyGMT and matplotlib when plotting the seaborn +diamonds dataset. It reports plotting time and `savefig` time separately. -| Mode | matplotlib | PyGMT | Ratio | +Run the benchmark: + +```bash +python benchmarks/bench_matplotlib_diamonds.py +``` + +| Step | matplotlib | PyGMT | matplotlib / PyGMT | | --- | --- | --- | --- | -| Plot | 0.0184s | 0.0299s | 0.6x | -| Plot and Save | 0.1213s | 0.8311s | 0.15x | +| Plotting | 0.021 | 0.028 | 0.75x | +| Savefig | 0.13 | 1.135 | 0.11x | ## License diff --git a/benchmarks/bench_matplotlib_diamonds.py b/benchmarks/bench_matplotlib_diamonds.py index 45eaec5..c527263 100644 --- a/benchmarks/bench_matplotlib_diamonds.py +++ b/benchmarks/bench_matplotlib_diamonds.py @@ -1,13 +1,7 @@ """ Benchmark PyGMT and matplotlib when plotting the diamonds dataset. 
- -This benchmark can time either plot construction only or the complete plotting -workflow: create a fresh figure, draw the scatter plot, add labels/legend, and write -the figure to disk. Dataset loading is intentionally outside the timed section so the -results focus on plotting and rendering rather than network or CSV parsing time. """ -import argparse import statistics import time from pathlib import Path @@ -19,7 +13,8 @@ COLORS = ("#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd") BACKENDS = ("matplotlib", "pygmt") -MODES = ("plot", "save") +OUTPUT_DIR = Path("plots/diamonds") +REPEATS = 10 DIAMONDS_DATA_URL = ( "https://github.com/mwaskom/seaborn-data/raw/master/diamonds.csv" ) @@ -36,13 +31,8 @@ def load_diamonds_data() -> pd.DataFrame: return pd.read_csv(DIAMONDS_DATA_URL) -def plot_matplotlib(data: pd.DataFrame, output: Path, save: bool) -> None: - """Create and render the diamonds scatter plot with matplotlib. - - The function deliberately creates a new figure on each call so repeated runs - measure the full plotting workflow, matching the PyGMT function below. Set - ``save`` to false to measure plot construction without file export. - """ +def plot_matplotlib(data: pd.DataFrame): + """Create the diamonds scatter plot with matplotlib.""" fig, ax = plt.subplots(figsize=(6, 4), dpi=300) for cut_id, cut_name in enumerate(CUT_ORDER): @@ -61,17 +51,17 @@ def plot_matplotlib(data: pd.DataFrame, output: Path, save: bool) -> None: ax.set_ylabel("Price (USD)") ax.set_title("Diamond price by carat") ax.legend(title="Cut", frameon=False, markerscale=4) - if save: - fig.savefig(output) - plt.close(fig) + return fig + +def save_matplotlib(fig, output: Path) -> None: + """Save a matplotlib figure and release it.""" + fig.savefig(output) + plt.close(fig) -def plot_pygmt(data: pd.DataFrame, output: Path, save: bool) -> None: - """Create and render the diamonds scatter plot with PyGMT. - PyGMT accepts tabular data directly. 
Passing only the two plotted columns keeps - this comparable to the x/y arrays handed to matplotlib. Set ``save`` to false to - measure plot construction without file export. +def plot_pygmt(data: pd.DataFrame) -> pygmt.Figure: + """Create the diamonds scatter plot with PyGMT. """ fig = pygmt.Figure() fig.basemap( @@ -84,7 +74,6 @@ def plot_pygmt(data: pd.DataFrame, output: Path, save: bool) -> None: yaxis=pygmt.params.Axis(annot=True, tick=True, label="Price (USD)"), ) ) - for cut_id, cut_name in enumerate(CUT_ORDER): cut_data = data[data["cut"] == cut_name] fig.plot( @@ -94,38 +83,47 @@ def plot_pygmt(data: pd.DataFrame, output: Path, save: bool) -> None: fill=f"{COLORS[cut_id]}@50", label=cut_name, ) - fig.legend(position=pygmt.params.Position("TR", offset=0.1)) - if save: - fig.savefig(output) + return fig + + +def save_pygmt(fig: pygmt.Figure, output: Path) -> None: + """Save a PyGMT figure.""" + fig.savefig(output) def benchmark( name: str, plot_func, + save_func, data: pd.DataFrame, output_dir: Path, repeats: int, - save: bool, -) -> list[float]: - """Time repeated plot creation and rendering runs. +) -> tuple[list[float], list[float]]: + """Time repeated plot creation and figure export runs. - The first call is an untimed warmup. It absorbs one-time backend setup such as - font discovery, GMT session initialization, and dynamic library loading. + The first call is an untimed warmup. It absorbs one-time backend setup such as font + discovery, and data downloading. """ output_dir.mkdir(parents=True, exist_ok=True) # Warm up each backend once before recording timings. 
- plot_func(data, output_dir / f"{name}_warmup.png", save) + fig = plot_func(data) + save_func(fig, output_dir / f"{name}_warmup.png") - timings = [] + plot_timings = [] + save_timings = [] for run_id in range(repeats): output = output_dir / f"{name}_{run_id + 1}.png" + start = time.perf_counter() - plot_func(data, output, save) - timings.append(time.perf_counter() - start) + fig = plot_func(data) + plot_timings.append(time.perf_counter() - start) + start = time.perf_counter()  # restart the clock so savefig is timed on its own + save_func(fig, output) + save_timings.append(time.perf_counter() - start) - return timings + return plot_timings, save_timings def format_summary(name: str, timings: list[float]) -> str: @@ -143,69 +141,33 @@ def format_summary(name: str, timings: list[float]) -> str: ) -def parse_args() -> argparse.Namespace: - """Parse command-line arguments.""" - parser = argparse.ArgumentParser( - description="Benchmark PyGMT and matplotlib diamonds scatter plotting." - ) - parser.add_argument( - "--backend", - choices=(*BACKENDS, "all"), - default="all", - help="plotting backend to benchmark", - ) - parser.add_argument( - "--repeats", - type=int, - default=10, - help="number of timed runs per backend", - ) - parser.add_argument( - "--output-dir", - type=Path, - default=Path("plots/diamonds"), - help="directory for rendered PNG files", - ) - parser.add_argument( - "--mode", - choices=(*MODES, "both"), - default="both", - help="benchmark plot construction, plot-and-save, or both", - ) - return parser.parse_args() - - def main() -> None: """Run the diamonds plotting benchmark.""" - args = parse_args() - if args.repeats < 1: - raise SystemExit("--repeats must be at least 1") - data = load_diamonds_data() - selected_backends = BACKENDS if args.backend == "all" else (args.backend,) plotters = { "matplotlib": plot_matplotlib, "pygmt": plot_pygmt, } - selected_modes = MODES if args.mode == "both" else (args.mode,) - - print(f"Running {args.repeats} timed run(s) per backend") - if "save" in selected_modes: - print(f"Writing PNG files to 
{args.output_dir}") - for mode in selected_modes: - save = mode == "save" - print(f"Mode: {mode}") - for backend in selected_backends: - print(f"Benchmarking {backend}...", flush=True) - timings = benchmark( - name=f"{backend}_{mode}", - plot_func=plotters[backend], - data=data, - output_dir=args.output_dir, - repeats=args.repeats, - save=save, - ) - print(format_summary(backend, timings)) + savers = { + "matplotlib": save_matplotlib, + "pygmt": save_pygmt, + } + + print(f"Running {REPEATS} timed run(s) per backend") + print(f"Writing PNG files to {OUTPUT_DIR}") + + for backend in BACKENDS: + print(f"Benchmarking {backend}...", flush=True) + plot_timings, save_timings = benchmark( + name=backend, + plot_func=plotters[backend], + save_func=savers[backend], + data=data, + output_dir=OUTPUT_DIR, + repeats=REPEATS, + ) + print(format_summary(f"{backend} plot", plot_timings)) + print(format_summary(f"{backend} savefig", save_timings)) if __name__ == "__main__":