Mixed-precision and accuracy settings

We test how various KeOps options affect the accuracy of computations.

Setup

# Base name of the CSV files that full_bench writes to the "output" folder.
output_filename = "accuracy"

import importlib
import os
import time

import numpy as np
import torch
from matplotlib import pyplot as plt

# Run on the GPU whenever PyTorch reports that CUDA is available.
use_cuda = torch.cuda.is_available()

# Number of coordinates of each sample point.
D = 3

Benchmark specifications:

# Time budgets, in seconds. The GPU settings are ten times more generous
# than the CPU ones.
#   MAXTIME: bench_config moves on to a smaller number of runs as soon as a
#            batch of runs takes longer than this; it is also the upper limit
#            of the runtime plot.
#   REDTIME: bench_config also reduces the number of runs (while more than
#            one run is performed) once a batch takes longer than REDTIME / 10.
if use_cuda:
    MAXTIME = 10
    REDTIME = 2
else:
    MAXTIME = 1
    REDTIME = 0.2

# Number of samples that we'll loop upon: 1-2-5 steps from 100 up to 1,000,000.
NS = [
    mantissa * 10**exponent for exponent in range(2, 6) for mantissa in (1, 2, 5)
] + [1000000]

Synthetic dataset.

def generate_samples(N, D, device, lang):
    """Create two reproducible random point clouds and a random source signal.

    Args:
        N: Number of points in each cloud.
        D: Number of coordinates of each point.
        device: Torch device (string or ``torch.device``) on which the tensors
            are created. Ignored by the NumPy branch.
        lang: ``"torch"`` to get float64 torch tensors; any other value
            selects the NumPy branch.

    Returns:
        A tuple ``(x, y, b)``.

        * ``lang == "torch"``: ``x`` and ``y`` are (N, D) tensors drawn
          uniformly in [0, 1), ``b`` is an (N, 1) tensor of standard normal
          samples.
        * otherwise: ``x`` and ``y`` are (N, D) arrays and ``b`` is an (N,)
          array, all drawn from a standard normal distribution. Note that the
          shape of ``b`` differs from the torch branch.
    """

    if lang == "torch":
        # torch.manual_seed seeds the generators of every device (CPU and
        # CUDA), so there is no need to branch on the device; this also works
        # whether `device` is given as a string or as a torch.device object.
        torch.manual_seed(123)

        x = torch.rand((N, D), device=device, dtype=torch.float64)
        y = torch.rand((N, D), device=device, dtype=torch.float64)
        # Draw a random source signal:
        b = torch.randn((N, 1), device=device, dtype=torch.float64)

    else:
        np.random.seed(1234)

        x = np.random.randn(N, D)
        y = np.random.randn(N, D)
        b = np.random.randn(N)

    return x, y, b

Define a simple kernel matrix-vector product (the kernel here is the squared Euclidean distance), using the pykeops.torch.LazyTensor wrapper:

from pykeops.torch import LazyTensor


def conv_lazytensor(x, y, b, dtype, dtype_acc, sum_scheme, use_fast_math):
    """Sum squared distances between x and y, weighted by b, over the points of y.

    For every row ``x[i]`` this reduces ``|x[i] - y[j]|^2 * b[j]`` over ``j``
    with a KeOps LazyTensor, on the GPU when ``use_cuda`` is set and on the
    CPU otherwise.

    Args:
        x: Tensor holding the M output points, one per row.
        y: Tensor holding the N source points, one per row.
        b: Tensor holding the signal attached to the source points.
        dtype: Unused here; kept so that all routines share one signature.
        dtype_acc: Accumulator type, forwarded to the KeOps reduction.
        sum_scheme: Summation scheme, forwarded to the KeOps reduction.
        use_fast_math: Fast-math switch, forwarded to the KeOps reduction.

    Returns:
        The result of the KeOps sum reduction over the source points.
    """
    reduction_options = {
        "dim": 1,
        "backend": "GPU" if use_cuda else "CPU",
        "dtype_acc": dtype_acc,
        "sum_scheme": sum_scheme,
        "use_fast_math": use_fast_math,
    }

    targets = LazyTensor(x.unsqueeze(-2))  # (M, 1, D)
    sources = LazyTensor(y.unsqueeze(-3))  # (1, N, D)
    difference = targets - sources
    sq_dists = (difference**2).sum(-1)  # (M, N, 1)
    weighted = sq_dists * b.unsqueeze(-3)  # (M, N, 1) * (1, N, 1)
    return weighted.sum(**reduction_options)

Benchmarking loops

def benchmark(
    Routine, dev, N, D, loops, lang, dtype, dtype_acc, sum_scheme, use_fast_math
):
    """Time a convolution on an N-by-N problem and evaluate its accuracy.

    Args:
        Routine: Callable invoked as
            ``Routine(x, y, b, dtype, dtype_acc, sum_scheme, use_fast_math)``
            and returning a torch tensor.
        dev: Device specification passed to ``torch.device``.
        N: Number of points in each cloud.
        D: Number of coordinates of each point.
        loops: Number of timed runs; ``0`` skips the timing entirely.
        lang: Forwarded to ``generate_samples``. The samples are converted
            with ``.to(...)``, so presumably only ``"torch"`` is usable here.
        dtype: ``"float16"``, ``"float32"`` or ``"float64"``: precision of the
            inputs handed to ``Routine``.
        dtype_acc: Accumulator type, forwarded to ``Routine``.
        sum_scheme: Summation scheme, forwarded to ``Routine``.
        use_fast_math: Fast-math switch, forwarded to ``Routine``.

    Returns:
        A tuple ``(elapsed, mean_err, max_err)``: the average time of one run
        in seconds (NaN when ``loops`` is 0), and the mean and max absolute
        errors with respect to a float64 Kahan-summed reference, both divided
        by the mean absolute value of the reference. Errors that are exactly
        zero are reported as NaN.

    Raises:
        ValueError: If ``dtype`` is not one of the supported names.
    """
    torch_dtypes = {
        "float16": torch.float16,
        "float32": torch.float32,
        "float64": torch.float64,
    }
    if dtype not in torch_dtypes:
        raise ValueError("Unsupported dtype: {!r}".format(dtype))
    torch_dtype = torch_dtypes[dtype]

    device = torch.device(dev)
    x_, y_, b_ = generate_samples(N, D, device, lang)
    x, y, b = x_.to(torch_dtype), y_.to(torch_dtype), b_.to(torch_dtype)

    # We simply benchmark a convolution

    N0 = min(N, 100)
    Routine(
        x[:N0, :], y[:N0, :], b[:N0, :], dtype, dtype_acc, sum_scheme, use_fast_math
    )  # Warmup run, to compile and load everything

    # timings
    if loops > 0:
        # Flush pending GPU work *before* starting the clock, so that it is
        # not billed to the benchmark.
        if use_cuda:
            torch.cuda.synchronize()
        t_0 = time.perf_counter()  # Actual benchmark --------------------
        for _ in range(loops):
            Routine(x, y, b, dtype, dtype_acc, sum_scheme, use_fast_math)
        if use_cuda:
            torch.cuda.synchronize()
        # Average time of a single run:
        elapsed = (time.perf_counter() - t_0) / loops  # -----------------
        # `elapsed` is already a per-run time: do not divide by `loops` again.
        print(
            "timing of {:3} NxN convolution(s), with N ={:7}: {:3}x{:3.6f}s".format(
                loops, N, loops, elapsed
            )
        )
    else:
        elapsed = np.nan

    # accuracy
    # The sources are shuffled for the tested routine: the exact sum does not
    # depend on their order, only the rounding errors do.
    ind = torch.randperm(y.shape[0])
    M = min(
        N, 1000
    )  # we evaluate accuracy on a subsample of outputs only because computations with full precisions are slow.
    out = Routine(
        x[:M, :], y[ind, :], b[ind, :], dtype, dtype_acc, sum_scheme, use_fast_math
    )
    ref_out = Routine(x_[:M, :], y_, b_, "float64", "float64", "kahan_scheme", False)

    abs_err = (out.double() - ref_out.double()).abs()
    ref_scale = ref_out.double().abs().mean()

    mean_err = (abs_err.mean() / ref_scale).item()
    max_err = (abs_err.max() / ref_scale).item()
    # Exact zeros are turned into NaN (the errors are later drawn on
    # logarithmic axes).
    mean_err = float("NaN") if mean_err == 0 else mean_err
    max_err = float("NaN") if max_err == 0 else max_err
    print(
        "accuracy of an MxN convolution, with M = {}, N ={:7}: mean err={:.1e}, max err={:.1e}".format(
            M, N, mean_err, max_err
        )
    )

    return elapsed, mean_err, max_err


def bench_config(
    Routine, backend, dev, lang, dtype, dtype_acc, sum_scheme, use_fast_math
):
    """Benchmark one configuration for every problem size listed in NS.

    The number of timed runs starts at 100 and steps down through 10, 1 and 0
    as the runs get slower; with 0 runs only the accuracy is still measured.

    Args:
        Routine: Callable to benchmark, forwarded to ``benchmark``.
        backend: Label of the configuration, only used in the printed banner.
        dev: Device specification, forwarded to ``benchmark``.
        lang, dtype, dtype_acc, sum_scheme, use_fast_math: Forwarded to
            ``benchmark``.

    Returns:
        Three lists ``(times, mean_errs, max_errs)`` with one entry per value
        of NS. Sizes that were not reached because of an error are NaN.
    """
    banner = "Backend : {}, Device : {}, dtype : {}, dtype_acc : {}, sum_scheme : {}, use_fast_math : {} -------------".format(
        backend, dev, dtype, dtype_acc, sum_scheme, use_fast_math
    )
    print(banner)

    # One (elapsed, mean_err, max_err) triple per problem size handled so far.
    measurements = []
    loop_schedule = [100, 10, 1, 0]

    try:
        nloops = loop_schedule.pop(0)
        for n in NS:
            measurements.append(
                benchmark(
                    Routine,
                    dev,
                    n,
                    D,
                    nloops,
                    lang,
                    dtype,
                    dtype_acc,
                    sum_scheme,
                    use_fast_math,
                )
            )
            if nloops <= 0:
                continue

            # Total time spent on the timed runs for this problem size.
            batch_time = nloops * measurements[-1][0]
            over_budget = batch_time > MAXTIME
            getting_slow = nloops > 1 and batch_time > REDTIME / 10
            if over_budget or getting_slow:
                nloops = loop_schedule.pop(0)

    except RuntimeError:
        print("**\nMemory overflow !")
    except IndexError:
        print("**\nToo slow !")

    # Pad the results so that every list has one entry per value of NS.
    padding = [np.nan] * (len(NS) - len(measurements))
    times = [elapsed for elapsed, _, _ in measurements]
    mean_errs = [mean_err for _, mean_err, _ in measurements]
    max_errs = [max_err for _, _, max_err in measurements]
    return times + padding, mean_errs + padding, max_errs + padding


def full_bench(title, routines):
    """Benchmark every configuration, plot the results and export them as CSV.

    Three figures are created (runtimes, mean errors, max errors), each with
    one curve per configuration, and three matching files
    ``output/<output_filename>_<k>.csv`` are written (k = 0, 1, 2).

    Args:
        title: Name of the benchmark, used in the printed banner and in the
            figure titles.
        routines: Sequence of 7-tuples ``(routine, backend, lang, dtype,
            dtype_acc, sum_scheme, use_fast_math)``. ``backend`` is the label
            used in the CSV header; the last four entries are the plot label.

    Returns:
        None. The figures are left open so that the caller can show them.
    """
    backends = [backend for (_, backend, _, _, _, _, _) in routines]

    print("Benchmarking : {} ===============================".format(title))

    lines_times = [NS]
    lines_mean_errs = [NS]
    lines_max_errs = [NS]
    for routine, backend, lang, dtype, dtype_acc, sum_scheme, use_fast_math in routines:
        res = bench_config(
            routine,
            backend,
            "cuda" if use_cuda else "cpu",
            lang,
            dtype,
            dtype_acc,
            sum_scheme,
            use_fast_math,
        )
        lines_times.append(res[0])
        lines_mean_errs.append(res[1])
        lines_max_errs.append(res[2])

    # One row per problem size; column 0 holds NS, then one column per config.
    benches_times = np.array(lines_times).T
    benches_mean_errs = np.array(lines_mean_errs).T
    benches_max_errs = np.array(lines_max_errs).T

    linestyles = [
        "o-",
        "s-",
        "^-",
        "<-",
        ">-",
        "v-",
        "1-",
        "+-",
        "*-",
        "x-",
        "p-",
        "d-",
    ]

    for ind_benches, benches in enumerate(
        (benches_times, benches_mean_errs, benches_max_errs)
    ):
        # Creates a pyplot figure:
        plt.figure(figsize=(12, 8))
        for i, config in enumerate(routines):
            plt.plot(
                benches[:, 0],
                benches[:, i + 1],
                # Cycle through the styles so that more configurations than
                # styles do not raise an IndexError.
                linestyles[i % len(linestyles)],
                linewidth=2,
                label='config = "{}"'.format(config[3:]),
            )

        plt.xlabel("Number of samples")
        if ind_benches == 0:
            plt.title("Runtimes for {} in dimension {}".format(title, D))
            plt.ylabel("Seconds")
        elif ind_benches == 1:
            plt.title("Mean errors for {} in dimension {}".format(title, D))
            plt.ylabel("Relative mean error")
        elif ind_benches == 2:
            plt.title("Max errors for {} in dimension {}".format(title, D))
            plt.ylabel("Relative max error")
        plt.yscale("log")
        plt.xscale("log")
        plt.legend(loc="upper left")
        plt.grid(True, which="major", linestyle="-")
        plt.grid(True, which="minor", linestyle="dotted")
        true_vals = benches[:, 1:].flatten()
        true_vals = true_vals[np.isfinite(true_vals)]
        # Without any finite measurement there is nothing to fit the axes to
        # (and min()/max() of an empty array would raise).
        if true_vals.size > 0:
            if ind_benches == 0:
                plt.axis([NS[0], NS[-1], true_vals.min(), MAXTIME])
            else:
                plt.axis([NS[0], NS[-1], true_vals.min(), 100 * true_vals.max()])
        plt.tight_layout()

        # Save as a .csv to put a nice Tikz figure in the papers:
        header = "Npoints " + " ".join(backends)
        os.makedirs("output", exist_ok=True)
        np.savetxt(
            "output/" + output_filename + "_" + str(ind_benches) + ".csv",
            benches,
            # Scientific notation for the measurements: a fixed "%.5f" would
            # write every error below 5e-6 as 0.00000.
            fmt=["%-9.0f"] + ["%-12.5e"] * len(routines),
            header=header,
            comments="",
        )

KeOps

# Configurations to compare. Every entry handed to full_bench is a tuple
# (routine, label, lang, dtype, dtype_acc, sum_scheme, use_fast_math); they
# all share the same routine and the "torch" language, so only the remaining
# fields are spelled out below.
routines = [
    (conv_lazytensor, label, "torch", dtype, dtype_acc, sum_scheme, use_fast_math)
    for (label, dtype, dtype_acc, sum_scheme, use_fast_math) in [
        # float16 inputs
        ("float16, direct_sum", "float16", "float16", "direct_sum", True),
        ("float16, block_sum", "float16", "float16", "block_sum", True),
        ("float16, kahan_scheme", "float16", "float16", "kahan_scheme", True),
        ("float16, float32 acc", "float16", "float32", "block_sum", True),
        # float32 inputs
        ("float32, direct_sum", "float32", "float32", "direct_sum", True),
        ("float32, block_sum", "float32", "float32", "block_sum", True),
        # Same label as the previous entry: only use_fast_math differs.
        ("float32, block_sum", "float32", "float32", "block_sum", False),
        ("float32, kahan_scheme", "float32", "float32", "kahan_scheme", True),
        ("float32, float64 acc", "float32", "float64", "block_sum", True),
        # float64 inputs
        ("float64, direct_sum", "float64", "float64", "direct_sum", True),
        ("float64, block_sum", "float64", "float64", "block_sum", True),
        ("float64, kahan_scheme", "float64", "float64", "kahan_scheme", True),
    ]
]

# Run the whole benchmark, then display the three figures it created.
full_bench(" Matrix-Vector products", routines)

plt.show()

Benchmarking :  Matrix-Vector products ===============================
Backend : float16, direct_sum, Device : cuda, dtype : float16, dtype_acc : float16, sum_scheme : direct_sum, use_fast_math : True -------------
timing of 100 NxN convolution(s), with N =    100: 100x0.000005s
accuracy of an MxN convolution, with M = 100, N =    100: mean err=1.5e-03, max err=6.9e-03
timing of 100 NxN convolution(s), with N =    200: 100x0.000005s
accuracy of an MxN convolution, with M = 200, N =    200: mean err=3.6e-03, max err=1.7e-02
timing of 100 NxN convolution(s), with N =    500: 100x0.000005s
accuracy of an MxN convolution, with M = 500, N =    500: mean err=4.2e-03, max err=2.9e-02
timing of 100 NxN convolution(s), with N =   1000: 100x0.000005s
accuracy of an MxN convolution, with M = 1000, N =   1000: mean err=6.1e-03, max err=3.6e-02
timing of 100 NxN convolution(s), with N =   2000: 100x0.000005s
accuracy of an MxN convolution, with M = 1000, N =   2000: mean err=4.6e-03, max err=3.2e-02
timing of 100 NxN convolution(s), with N =   5000: 100x0.000006s
accuracy of an MxN convolution, with M = 1000, N =   5000: mean err=8.6e-03, max err=7.6e-02
timing of 100 NxN convolution(s), with N =  10000: 100x0.000008s
accuracy of an MxN convolution, with M = 1000, N =  10000: mean err=1.2e-02, max err=9.0e-02
timing of 100 NxN convolution(s), with N =  20000: 100x0.000011s
accuracy of an MxN convolution, with M = 1000, N =  20000: mean err=1.3e-02, max err=9.4e-02
timing of 100 NxN convolution(s), with N =  50000: 100x0.000021s
accuracy of an MxN convolution, with M = 1000, N =  50000: mean err=2.0e-02, max err=1.1e-01
timing of  10 NxN convolution(s), with N = 100000:  10x0.000510s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=6.0e-02, max err=4.3e-01
timing of  10 NxN convolution(s), with N = 200000:  10x0.001445s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=9.4e-02, max err=5.9e-01
timing of  10 NxN convolution(s), with N = 500000:  10x0.008434s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=8.7e-02, max err=6.3e-01
timing of   1 NxN convolution(s), with N =1000000:   1x0.316824s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=1.9e-01, max err=1.2e+00
Backend : float16, block_sum, Device : cuda, dtype : float16, dtype_acc : float16, sum_scheme : block_sum, use_fast_math : True -------------
timing of 100 NxN convolution(s), with N =    100: 100x0.000006s
accuracy of an MxN convolution, with M = 100, N =    100: mean err=1.5e-03, max err=6.9e-03
timing of 100 NxN convolution(s), with N =    200: 100x0.000005s
accuracy of an MxN convolution, with M = 200, N =    200: mean err=3.6e-03, max err=1.6e-02
timing of 100 NxN convolution(s), with N =    500: 100x0.000005s
accuracy of an MxN convolution, with M = 500, N =    500: mean err=2.6e-03, max err=1.5e-02
timing of 100 NxN convolution(s), with N =   1000: 100x0.000005s
accuracy of an MxN convolution, with M = 1000, N =   1000: mean err=4.0e-03, max err=2.1e-02
timing of 100 NxN convolution(s), with N =   2000: 100x0.000005s
accuracy of an MxN convolution, with M = 1000, N =   2000: mean err=1.5e-03, max err=6.7e-03
timing of 100 NxN convolution(s), with N =   5000: 100x0.000006s
accuracy of an MxN convolution, with M = 1000, N =   5000: mean err=2.2e-03, max err=1.1e-02
timing of 100 NxN convolution(s), with N =  10000: 100x0.000008s
accuracy of an MxN convolution, with M = 1000, N =  10000: mean err=1.8e-03, max err=1.0e-02
timing of 100 NxN convolution(s), with N =  20000: 100x0.000011s
accuracy of an MxN convolution, with M = 1000, N =  20000: mean err=2.2e-03, max err=9.3e-03
timing of 100 NxN convolution(s), with N =  50000: 100x0.000021s
accuracy of an MxN convolution, with M = 1000, N =  50000: mean err=2.1e-03, max err=1.0e-02
timing of  10 NxN convolution(s), with N = 100000:  10x0.000533s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=6.3e-03, max err=3.7e-02
timing of  10 NxN convolution(s), with N = 200000:  10x0.001534s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=8.1e-03, max err=3.7e-02
timing of  10 NxN convolution(s), with N = 500000:  10x0.008915s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=7.1e-03, max err=6.4e-02
timing of   1 NxN convolution(s), with N =1000000:   1x0.334264s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=1.4e-02, max err=1.4e-01
Backend : float16, kahan_scheme, Device : cuda, dtype : float16, dtype_acc : float16, sum_scheme : kahan_scheme, use_fast_math : True -------------
timing of 100 NxN convolution(s), with N =    100: 100x0.000006s
accuracy of an MxN convolution, with M = 100, N =    100: mean err=1.0e-03, max err=3.0e-03
timing of 100 NxN convolution(s), with N =    200: 100x0.000006s
accuracy of an MxN convolution, with M = 200, N =    200: mean err=1.6e-03, max err=4.0e-03
timing of 100 NxN convolution(s), with N =    500: 100x0.000005s
accuracy of an MxN convolution, with M = 500, N =    500: mean err=9.4e-04, max err=4.8e-03
timing of 100 NxN convolution(s), with N =   1000: 100x0.000005s
accuracy of an MxN convolution, with M = 1000, N =   1000: mean err=9.4e-04, max err=4.3e-03
timing of 100 NxN convolution(s), with N =   2000: 100x0.000006s
accuracy of an MxN convolution, with M = 1000, N =   2000: mean err=4.6e-04, max err=1.8e-03
timing of 100 NxN convolution(s), with N =   5000: 100x0.000007s
accuracy of an MxN convolution, with M = 1000, N =   5000: mean err=6.6e-04, max err=2.9e-03
timing of 100 NxN convolution(s), with N =  10000: 100x0.000009s
accuracy of an MxN convolution, with M = 1000, N =  10000: mean err=4.3e-04, max err=1.9e-03
timing of 100 NxN convolution(s), with N =  20000: 100x0.000013s
accuracy of an MxN convolution, with M = 1000, N =  20000: mean err=5.3e-04, max err=2.6e-03
timing of 100 NxN convolution(s), with N =  50000: 100x0.000028s
accuracy of an MxN convolution, with M = 1000, N =  50000: mean err=4.1e-04, max err=1.9e-03
timing of  10 NxN convolution(s), with N = 100000:  10x0.000697s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=1.2e-03, max err=5.5e-03
timing of  10 NxN convolution(s), with N = 200000:  10x0.002033s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=1.5e-03, max err=6.9e-03
timing of   1 NxN convolution(s), with N = 500000:   1x0.118587s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=5.1e-04, max err=2.3e-03
timing of   1 NxN convolution(s), with N =1000000:   1x0.443472s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=1.6e-03, max err=6.8e-03
Backend : float16, float32 acc, Device : cuda, dtype : float16, dtype_acc : float32, sum_scheme : block_sum, use_fast_math : True -------------
timing of 100 NxN convolution(s), with N =    100: 100x0.000006s
accuracy of an MxN convolution, with M = 100, N =    100: mean err=1.5e-03, max err=6.9e-03
timing of 100 NxN convolution(s), with N =    200: 100x0.000005s
accuracy of an MxN convolution, with M = 200, N =    200: mean err=3.6e-03, max err=1.6e-02
timing of 100 NxN convolution(s), with N =    500: 100x0.000005s
accuracy of an MxN convolution, with M = 500, N =    500: mean err=2.6e-03, max err=1.5e-02
timing of 100 NxN convolution(s), with N =   1000: 100x0.000005s
accuracy of an MxN convolution, with M = 1000, N =   1000: mean err=4.0e-03, max err=2.2e-02
timing of 100 NxN convolution(s), with N =   2000: 100x0.000005s
accuracy of an MxN convolution, with M = 1000, N =   2000: mean err=1.5e-03, max err=6.8e-03
timing of 100 NxN convolution(s), with N =   5000: 100x0.000006s
accuracy of an MxN convolution, with M = 1000, N =   5000: mean err=2.1e-03, max err=9.7e-03
timing of 100 NxN convolution(s), with N =  10000: 100x0.000008s
accuracy of an MxN convolution, with M = 1000, N =  10000: mean err=1.5e-03, max err=7.3e-03
timing of 100 NxN convolution(s), with N =  20000: 100x0.000011s
accuracy of an MxN convolution, with M = 1000, N =  20000: mean err=1.9e-03, max err=9.0e-03
timing of 100 NxN convolution(s), with N =  50000: 100x0.000022s
accuracy of an MxN convolution, with M = 1000, N =  50000: mean err=1.4e-03, max err=7.9e-03
timing of  10 NxN convolution(s), with N = 100000:  10x0.000523s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=4.5e-03, max err=2.4e-02
timing of  10 NxN convolution(s), with N = 200000:  10x0.001474s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=4.4e-03, max err=2.2e-02
timing of  10 NxN convolution(s), with N = 500000:  10x0.008531s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=1.4e-03, max err=6.7e-03
timing of   1 NxN convolution(s), with N =1000000:   1x0.320990s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=5.5e-03, max err=3.2e-02
Backend : float32, direct_sum, Device : cuda, dtype : float32, dtype_acc : float32, sum_scheme : direct_sum, use_fast_math : True -------------
timing of 100 NxN convolution(s), with N =    100: 100x0.000004s
accuracy of an MxN convolution, with M = 100, N =    100: mean err=1.4e-07, max err=5.8e-07
timing of 100 NxN convolution(s), with N =    200: 100x0.000004s
accuracy of an MxN convolution, with M = 200, N =    200: mean err=4.2e-07, max err=1.7e-06
timing of 100 NxN convolution(s), with N =    500: 100x0.000003s
accuracy of an MxN convolution, with M = 500, N =    500: mean err=4.0e-07, max err=2.4e-06
timing of 100 NxN convolution(s), with N =   1000: 100x0.000003s
accuracy of an MxN convolution, with M = 1000, N =   1000: mean err=6.4e-07, max err=3.8e-06
timing of 100 NxN convolution(s), with N =   2000: 100x0.000004s
accuracy of an MxN convolution, with M = 1000, N =   2000: mean err=5.0e-07, max err=5.3e-06
timing of 100 NxN convolution(s), with N =   5000: 100x0.000004s
accuracy of an MxN convolution, with M = 1000, N =   5000: mean err=1.1e-06, max err=6.5e-06
timing of 100 NxN convolution(s), with N =  10000: 100x0.000006s
accuracy of an MxN convolution, with M = 1000, N =  10000: mean err=1.7e-06, max err=1.1e-05
timing of 100 NxN convolution(s), with N =  20000: 100x0.000008s
accuracy of an MxN convolution, with M = 1000, N =  20000: mean err=1.7e-06, max err=1.1e-05
timing of 100 NxN convolution(s), with N =  50000: 100x0.000034s
accuracy of an MxN convolution, with M = 1000, N =  50000: mean err=2.7e-06, max err=1.5e-05
timing of  10 NxN convolution(s), with N = 100000:  10x0.001018s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=6.1e-06, max err=4.3e-05
timing of  10 NxN convolution(s), with N = 200000:  10x0.003838s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=9.8e-06, max err=6.4e-05
timing of   1 NxN convolution(s), with N = 500000:   1x0.233670s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=1.1e-05, max err=1.1e-04
timing of   1 NxN convolution(s), with N =1000000:   1x0.909367s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=2.6e-05, max err=1.9e-04
Backend : float32, block_sum, Device : cuda, dtype : float32, dtype_acc : float32, sum_scheme : block_sum, use_fast_math : True -------------
timing of 100 NxN convolution(s), with N =    100: 100x0.000004s
accuracy of an MxN convolution, with M = 100, N =    100: mean err=1.4e-07, max err=5.8e-07
timing of 100 NxN convolution(s), with N =    200: 100x0.000004s
accuracy of an MxN convolution, with M = 200, N =    200: mean err=4.2e-07, max err=1.8e-06
timing of 100 NxN convolution(s), with N =    500: 100x0.000003s
accuracy of an MxN convolution, with M = 500, N =    500: mean err=3.0e-07, max err=1.6e-06
timing of 100 NxN convolution(s), with N =   1000: 100x0.000003s
accuracy of an MxN convolution, with M = 1000, N =   1000: mean err=4.3e-07, max err=2.5e-06
timing of 100 NxN convolution(s), with N =   2000: 100x0.000004s
accuracy of an MxN convolution, with M = 1000, N =   2000: mean err=1.4e-07, max err=7.6e-07
timing of 100 NxN convolution(s), with N =   5000: 100x0.000005s
accuracy of an MxN convolution, with M = 1000, N =   5000: mean err=2.8e-07, max err=1.5e-06
timing of 100 NxN convolution(s), with N =  10000: 100x0.000006s
accuracy of an MxN convolution, with M = 1000, N =  10000: mean err=2.2e-07, max err=1.1e-06
timing of 100 NxN convolution(s), with N =  20000: 100x0.000009s
accuracy of an MxN convolution, with M = 1000, N =  20000: mean err=2.8e-07, max err=1.5e-06
timing of 100 NxN convolution(s), with N =  50000: 100x0.000033s
accuracy of an MxN convolution, with M = 1000, N =  50000: mean err=2.7e-07, max err=1.8e-06
timing of  10 NxN convolution(s), with N = 100000:  10x0.000977s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=6.7e-07, max err=2.8e-06
timing of  10 NxN convolution(s), with N = 200000:  10x0.003618s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=9.1e-07, max err=4.5e-06
timing of   1 NxN convolution(s), with N = 500000:   1x0.218077s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=8.4e-07, max err=7.5e-06
timing of   1 NxN convolution(s), with N =1000000:   1x0.849226s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=2.1e-06, max err=1.5e-05
Backend : float32, block_sum, Device : cuda, dtype : float32, dtype_acc : float32, sum_scheme : block_sum, use_fast_math : False -------------
timing of 100 NxN convolution(s), with N =    100: 100x0.000004s
accuracy of an MxN convolution, with M = 100, N =    100: mean err=1.4e-07, max err=5.8e-07
timing of 100 NxN convolution(s), with N =    200: 100x0.000004s
accuracy of an MxN convolution, with M = 200, N =    200: mean err=4.2e-07, max err=1.8e-06
timing of 100 NxN convolution(s), with N =    500: 100x0.000003s
accuracy of an MxN convolution, with M = 500, N =    500: mean err=3.0e-07, max err=1.6e-06
timing of 100 NxN convolution(s), with N =   1000: 100x0.000003s
accuracy of an MxN convolution, with M = 1000, N =   1000: mean err=4.3e-07, max err=2.5e-06
timing of 100 NxN convolution(s), with N =   2000: 100x0.000004s
accuracy of an MxN convolution, with M = 1000, N =   2000: mean err=1.4e-07, max err=7.6e-07
timing of 100 NxN convolution(s), with N =   5000: 100x0.000005s
accuracy of an MxN convolution, with M = 1000, N =   5000: mean err=2.8e-07, max err=1.5e-06
timing of 100 NxN convolution(s), with N =  10000: 100x0.000006s
accuracy of an MxN convolution, with M = 1000, N =  10000: mean err=2.2e-07, max err=1.1e-06
timing of 100 NxN convolution(s), with N =  20000: 100x0.000009s
accuracy of an MxN convolution, with M = 1000, N =  20000: mean err=2.8e-07, max err=1.5e-06
timing of 100 NxN convolution(s), with N =  50000: 100x0.000033s
accuracy of an MxN convolution, with M = 1000, N =  50000: mean err=2.7e-07, max err=1.8e-06
timing of  10 NxN convolution(s), with N = 100000:  10x0.000977s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=6.7e-07, max err=2.8e-06
timing of  10 NxN convolution(s), with N = 200000:  10x0.003618s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=9.1e-07, max err=4.5e-06
timing of   1 NxN convolution(s), with N = 500000:   1x0.218071s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=8.4e-07, max err=7.5e-06
timing of   1 NxN convolution(s), with N =1000000:   1x0.846144s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=2.1e-06, max err=1.5e-05
Backend : float32, kahan_scheme, Device : cuda, dtype : float32, dtype_acc : float32, sum_scheme : kahan_scheme, use_fast_math : True -------------
timing of 100 NxN convolution(s), with N =    100: 100x0.000004s
accuracy of an MxN convolution, with M = 100, N =    100: mean err=8.6e-08, max err=3.2e-07
timing of 100 NxN convolution(s), with N =    200: 100x0.000004s
accuracy of an MxN convolution, with M = 200, N =    200: mean err=1.2e-07, max err=4.2e-07
timing of 100 NxN convolution(s), with N =    500: 100x0.000003s
accuracy of an MxN convolution, with M = 500, N =    500: mean err=1.2e-07, max err=6.3e-07
timing of 100 NxN convolution(s), with N =   1000: 100x0.000003s
accuracy of an MxN convolution, with M = 1000, N =   1000: mean err=1.3e-07, max err=7.1e-07
timing of 100 NxN convolution(s), with N =   2000: 100x0.000004s
accuracy of an MxN convolution, with M = 1000, N =   2000: mean err=5.4e-08, max err=3.6e-07
timing of 100 NxN convolution(s), with N =   5000: 100x0.000005s
accuracy of an MxN convolution, with M = 1000, N =   5000: mean err=9.0e-08, max err=5.0e-07
timing of 100 NxN convolution(s), with N =  10000: 100x0.000007s
accuracy of an MxN convolution, with M = 1000, N =  10000: mean err=6.3e-08, max err=3.2e-07
timing of 100 NxN convolution(s), with N =  20000: 100x0.000011s
accuracy of an MxN convolution, with M = 1000, N =  20000: mean err=6.5e-08, max err=3.3e-07
timing of 100 NxN convolution(s), with N =  50000: 100x0.000047s
accuracy of an MxN convolution, with M = 1000, N =  50000: mean err=5.0e-08, max err=2.7e-07
timing of  10 NxN convolution(s), with N = 100000:  10x0.001465s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=1.4e-07, max err=6.6e-07
timing of  10 NxN convolution(s), with N = 200000:  10x0.005598s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=1.4e-07, max err=6.3e-07
timing of   1 NxN convolution(s), with N = 500000:   1x0.342779s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=5.5e-08, max err=3.2e-07
timing of   1 NxN convolution(s), with N =1000000:   1x1.336671s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=2.0e-07, max err=1.0e-06
Backend : float32, float64 acc, Device : cuda, dtype : float32, dtype_acc : float64, sum_scheme : block_sum, use_fast_math : True -------------
timing of 100 NxN convolution(s), with N =    100: 100x0.000004s
accuracy of an MxN convolution, with M = 100, N =    100: mean err=1.4e-07, max err=5.8e-07
timing of 100 NxN convolution(s), with N =    200: 100x0.000004s
accuracy of an MxN convolution, with M = 200, N =    200: mean err=4.2e-07, max err=1.8e-06
timing of 100 NxN convolution(s), with N =    500: 100x0.000003s
accuracy of an MxN convolution, with M = 500, N =    500: mean err=3.0e-07, max err=1.6e-06
timing of 100 NxN convolution(s), with N =   1000: 100x0.000003s
accuracy of an MxN convolution, with M = 1000, N =   1000: mean err=4.3e-07, max err=2.5e-06
timing of 100 NxN convolution(s), with N =   2000: 100x0.000004s
accuracy of an MxN convolution, with M = 1000, N =   2000: mean err=1.4e-07, max err=7.6e-07
timing of 100 NxN convolution(s), with N =   5000: 100x0.000005s
accuracy of an MxN convolution, with M = 1000, N =   5000: mean err=2.6e-07, max err=1.5e-06
timing of 100 NxN convolution(s), with N =  10000: 100x0.000006s
accuracy of an MxN convolution, with M = 1000, N =  10000: mean err=1.8e-07, max err=1.0e-06
timing of 100 NxN convolution(s), with N =  20000: 100x0.000009s
accuracy of an MxN convolution, with M = 1000, N =  20000: mean err=2.5e-07, max err=1.3e-06
timing of 100 NxN convolution(s), with N =  50000: 100x0.000033s
accuracy of an MxN convolution, with M = 1000, N =  50000: mean err=1.8e-07, max err=7.4e-07
timing of  10 NxN convolution(s), with N = 100000:  10x0.000977s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=5.1e-07, max err=2.8e-06
timing of  10 NxN convolution(s), with N = 200000:  10x0.003622s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=5.0e-07, max err=2.8e-06
timing of   1 NxN convolution(s), with N = 500000:   1x0.218161s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=1.8e-07, max err=8.1e-07
timing of   1 NxN convolution(s), with N =1000000:   1x0.846391s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=6.9e-07, max err=3.7e-06
Backend : float64, direct_sum, Device : cuda, dtype : float64, dtype_acc : float64, sum_scheme : direct_sum, use_fast_math : True -------------
timing of 100 NxN convolution(s), with N =    100: 100x0.000004s
accuracy of an MxN convolution, with M = 100, N =    100: mean err=2.1e-16, max err=1.3e-15
timing of 100 NxN convolution(s), with N =    200: 100x0.000004s
accuracy of an MxN convolution, with M = 200, N =    200: mean err=8.7e-16, max err=3.7e-15
timing of 100 NxN convolution(s), with N =    500: 100x0.000004s
accuracy of an MxN convolution, with M = 500, N =    500: mean err=8.1e-16, max err=4.7e-15
timing of 100 NxN convolution(s), with N =   1000: 100x0.000003s
accuracy of an MxN convolution, with M = 1000, N =   1000: mean err=1.2e-15, max err=6.0e-15
timing of 100 NxN convolution(s), with N =   2000: 100x0.000004s
accuracy of an MxN convolution, with M = 1000, N =   2000: mean err=9.2e-16, max err=6.2e-15
timing of 100 NxN convolution(s), with N =   5000: 100x0.000005s
accuracy of an MxN convolution, with M = 1000, N =   5000: mean err=2.0e-15, max err=1.4e-14
timing of 100 NxN convolution(s), with N =  10000: 100x0.000008s
accuracy of an MxN convolution, with M = 1000, N =  10000: mean err=3.1e-15, max err=1.8e-14
timing of 100 NxN convolution(s), with N =  20000: 100x0.000012s
accuracy of an MxN convolution, with M = 1000, N =  20000: mean err=3.1e-15, max err=2.1e-14
timing of 100 NxN convolution(s), with N =  50000: 100x0.000063s
accuracy of an MxN convolution, with M = 1000, N =  50000: mean err=4.6e-15, max err=3.4e-14
timing of  10 NxN convolution(s), with N = 100000:  10x0.001977s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=1.1e-14, max err=6.4e-14
timing of  10 NxN convolution(s), with N = 200000:  10x0.007663s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=1.8e-14, max err=1.4e-13
timing of   1 NxN convolution(s), with N = 500000:   1x0.472913s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=2.1e-14, max err=1.3e-13
timing of   1 NxN convolution(s), with N =1000000:   1x1.843502s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=5.0e-14, max err=3.1e-13
Backend : float64, block_sum, Device : cuda, dtype : float64, dtype_acc : float64, sum_scheme : block_sum, use_fast_math : True -------------
timing of 100 NxN convolution(s), with N =    100: 100x0.000004s
accuracy of an MxN convolution, with M = 100, N =    100: mean err=2.1e-16, max err=1.3e-15
timing of 100 NxN convolution(s), with N =    200: 100x0.000003s
accuracy of an MxN convolution, with M = 200, N =    200: mean err=8.7e-16, max err=3.7e-15
timing of 100 NxN convolution(s), with N =    500: 100x0.000003s
accuracy of an MxN convolution, with M = 500, N =    500: mean err=5.6e-16, max err=2.8e-15
timing of 100 NxN convolution(s), with N =   1000: 100x0.000003s
accuracy of an MxN convolution, with M = 1000, N =   1000: mean err=7.7e-16, max err=3.5e-15
timing of 100 NxN convolution(s), with N =   2000: 100x0.000004s
accuracy of an MxN convolution, with M = 1000, N =   2000: mean err=2.5e-16, max err=1.5e-15
timing of 100 NxN convolution(s), with N =   5000: 100x0.000005s
accuracy of an MxN convolution, with M = 1000, N =   5000: mean err=4.8e-16, max err=2.6e-15
timing of 100 NxN convolution(s), with N =  10000: 100x0.000007s
accuracy of an MxN convolution, with M = 1000, N =  10000: mean err=4.0e-16, max err=2.2e-15
timing of 100 NxN convolution(s), with N =  20000: 100x0.000012s
accuracy of an MxN convolution, with M = 1000, N =  20000: mean err=5.0e-16, max err=3.0e-15
timing of 100 NxN convolution(s), with N =  50000: 100x0.000062s
accuracy of an MxN convolution, with M = 1000, N =  50000: mean err=4.8e-16, max err=2.5e-15
timing of  10 NxN convolution(s), with N = 100000:  10x0.001976s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=1.3e-15, max err=5.8e-15
timing of  10 NxN convolution(s), with N = 200000:  10x0.007664s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=1.6e-15, max err=7.9e-15
timing of   1 NxN convolution(s), with N = 500000:   1x0.473102s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=1.5e-15, max err=1.3e-14
timing of   1 NxN convolution(s), with N =1000000:   1x1.841738s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=3.7e-15, max err=2.3e-14
Backend : float64, kahan_scheme, Device : cuda, dtype : float64, dtype_acc : float64, sum_scheme : kahan_scheme, use_fast_math : True -------------
timing of 100 NxN convolution(s), with N =    100: 100x0.000004s
accuracy of an MxN convolution, with M = 100, N =    100: mean err=7.9e-17, max err=4.5e-16
timing of 100 NxN convolution(s), with N =    200: 100x0.000004s
accuracy of an MxN convolution, with M = 200, N =    200: mean err=1.3e-16, max err=6.0e-16
timing of 100 NxN convolution(s), with N =    500: 100x0.000003s
accuracy of an MxN convolution, with M = 500, N =    500: mean err=1.2e-16, max err=5.5e-16
timing of 100 NxN convolution(s), with N =   1000: 100x0.000004s
accuracy of an MxN convolution, with M = 1000, N =   1000: mean err=1.3e-16, max err=5.5e-16
timing of 100 NxN convolution(s), with N =   2000: 100x0.000004s
accuracy of an MxN convolution, with M = 1000, N =   2000: mean err=4.5e-17, max err=4.3e-16
timing of 100 NxN convolution(s), with N =   5000: 100x0.000006s
accuracy of an MxN convolution, with M = 1000, N =   5000: mean err=6.9e-17, max err=4.4e-16
timing of 100 NxN convolution(s), with N =  10000: 100x0.000010s
accuracy of an MxN convolution, with M = 1000, N =  10000: mean err=4.6e-17, max err=4.5e-16
timing of 100 NxN convolution(s), with N =  20000: 100x0.000016s
accuracy of an MxN convolution, with M = 1000, N =  20000: mean err=5.6e-17, max err=3.8e-16
timing of 100 NxN convolution(s), with N =  50000: 100x0.000088s
accuracy of an MxN convolution, with M = 1000, N =  50000: mean err=4.2e-17, max err=3.6e-16
timing of  10 NxN convolution(s), with N = 100000:  10x0.002785s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=1.2e-16, max err=7.7e-16
timing of   1 NxN convolution(s), with N = 200000:   1x0.108043s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=1.3e-16, max err=7.9e-16
timing of   1 NxN convolution(s), with N = 500000:   1x0.665048s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=4.3e-17, max err=4.7e-16
timing of   1 NxN convolution(s), with N =1000000:   1x2.605435s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=1.7e-16, max err=9.7e-16

Total running time of the script: (0 minutes 38.054 seconds)

Gallery generated by Sphinx-Gallery