Note

Click here to download the full example code

# Mixed-precision and accuracy settings

We test various options of KeOps regarding accuracy of computations.

## Setup

```
# Base name of the .csv files that full_bench writes in the "output" folder.
output_filename = "accuracy"
import importlib
import os
import time
import numpy as np
import torch
from matplotlib import pyplot as plt
# Run on the GPU whenever a CUDA device is available, on the CPU otherwise.
use_cuda = torch.cuda.is_available()
# Dimension of the point clouds used throughout the benchmark.
D = 3
```

Benchmark specifications:

```
# Time budgets, in seconds; both are smaller on CPU-only machines.
# MAXTIME: when a batch of timed runs takes longer than this in total,
# bench_config switches to the next (smaller) number of runs.
MAXTIME = 10 if use_cuda else 1
# REDTIME: bench_config also reduces the number of timed runs as soon as a
# batch of several runs takes longer than REDTIME / 10 seconds in total.
REDTIME = 2 if use_cuda else .2
# Numbers of samples N that the benchmarks loop over.
NS = [100, 200, 500,
1000, 2000, 5000,
10000, 20000, 50000,
100000, 200000, 500000,
1000000, 2000000, 5000000]
```

Synthetic dataset.

```
def generate_samples(N, D, device, lang):
    """Create reproducible random point clouds and a random source signal.

    Args:
        N: number of points in each cloud.
        D: dimension of the ambient space.
        device: device (string or ``torch.device``) on which the tensors are
            created. Only used when ``lang == 'torch'``.
        lang: ``'torch'`` to get float64 torch tensors; any other value
            returns NumPy arrays.

    Returns:
        A tuple ``(x, y, b)``. With ``lang == 'torch'``, ``x`` and ``y`` are
        (N, D) tensors sampled uniformly in the unit cube and ``b`` is an
        (N, 1) tensor of standard normal values. Otherwise, ``x`` and ``y``
        are (N, D) arrays of standard normal values and ``b`` has shape (N,).
    """
    if lang == 'torch':
        # ``device`` may be a plain string or a torch.device; comparing a
        # torch.device to a string is not reliable, so test its type instead.
        if torch.device(device).type == 'cuda':
            torch.cuda.manual_seed_all(123)
        else:
            torch.manual_seed(123)
        x = torch.rand((N, D), device=device, dtype=torch.float64)
        y = torch.rand((N, D), device=device, dtype=torch.float64)
        # Draw a random source signal:
        b = torch.randn((N, 1), device=device, dtype=torch.float64)
    else:
        np.random.seed(1234)
        x = np.random.randn(N, D)
        y = np.random.randn(N, D)
        b = np.random.randn(N)
    return x, y, b
```

Define a simple kernel matrix-vector product (with a squared-distance kernel), using the `pykeops.torch.LazyTensor` wrapper:

```
from pykeops.torch import LazyTensor
def conv_lazytensor(x, y, b, dtype, dtype_acc, sum_scheme):
    """Apply a squared-distance kernel matrix to the signal ``b`` with KeOps.

    The symbolic (M, N) matrix of squared distances between the rows of ``x``
    and the rows of ``y`` is multiplied by ``b`` and summed over its second
    axis. ``dtype_acc`` and ``sum_scheme`` are forwarded to the reduction;
    ``dtype`` is not used by this routine.
    """
    targets = LazyTensor(x.unsqueeze(-2))  # (M, 1, D)
    sources = LazyTensor(y.unsqueeze(-3))  # (1, N, D)
    difference = targets - sources
    sq_dists = (difference ** 2).sum(-1)  # (M, N, 1)
    weighted = sq_dists * b.unsqueeze(-3)  # (M, N, 1) * (1, N, 1)

    # Run the reduction on the GPU if one is available.
    if use_cuda:
        reduction_backend = 'GPU'
    else:
        reduction_backend = 'CPU'

    return weighted.sum(
        dim=1,
        backend=reduction_backend,
        dtype_acc=dtype_acc,
        sum_scheme=sum_scheme,
    )
```

## Benchmarking loops

```
def benchmark(Routine, dev, N, D, loops, lang, dtype, dtype_acc, sum_scheme):
    """Time a convolution on an N-by-N problem and evaluate its accuracy.

    Args:
        Routine: callable ``Routine(x, y, b, dtype, dtype_acc, sum_scheme)``
            that returns the result of the convolution.
        dev: device specification, passed to ``torch.device``.
        N: number of points in each cloud.
        D: dimension of the points.
        loops: number of timed runs; if 0, the timing step is skipped.
        lang: forwarded to ``generate_samples``. The samples are then cast
            with ``.to(...)``, so torch tensors are expected.
        dtype: ``"float16"``, ``"float32"`` or ``"float64"``; precision of
            the inputs given to ``Routine``.
        dtype_acc: forwarded to ``Routine``.
        sum_scheme: forwarded to ``Routine``.

    Returns:
        A tuple ``(elapsed, mean_err, max_err)``. ``elapsed`` is the average
        time of one run in seconds (NaN if ``loops`` is 0). The two errors are
        the mean and max absolute deviations from a float64 reference,
        divided by the mean absolute value of that reference; an error that
        is exactly 0 is reported as NaN.

    Raises:
        ValueError: if ``dtype`` is not one of the supported names.
    """
    importlib.reload(torch)  # In case we had a memory overflow just before...
    device = torch.device(dev)
    x_, y_, b_ = generate_samples(N, D, device, lang)

    torch_dtypes = {
        "float16": torch.float16,
        "float32": torch.float32,
        "float64": torch.float64,
    }
    if dtype not in torch_dtypes:
        raise ValueError("Unsupported dtype: {!r}".format(dtype))
    torch_dtype = torch_dtypes[dtype]
    x, y, b = x_.to(torch_dtype), y_.to(torch_dtype), b_.to(torch_dtype)

    # We simply benchmark a convolution
    N0 = min(N, 100)
    # Warmup run, to compile and load everything
    Routine(x[:N0, :], y[:N0, :], b[:N0, :], dtype, dtype_acc, sum_scheme)

    # timings
    if loops > 0:
        # Wait for pending GPU work (e.g. the warmup run) before starting
        # the clock, so that it is not counted in the measurement.
        if use_cuda:
            torch.cuda.synchronize()
        t_0 = time.perf_counter()  # Actual benchmark --------------------
        for _ in range(loops):
            Routine(x, y, b, dtype, dtype_acc, sum_scheme)
        if use_cuda:
            torch.cuda.synchronize()
        elapsed = (time.perf_counter() - t_0) / loops  # -----------------
        # ``elapsed`` is already the average time of one run: print it as is.
        print("timing of {:3} NxN convolution(s), with N ={:7}: {:3}x{:3.6f}s".format(loops, N, loops, elapsed))
    else:
        elapsed = np.nan

    # accuracy
    ind = torch.randperm(y.shape[0])
    # We evaluate accuracy on a subsample of outputs only, because
    # computations with full precision are slow.
    M = min(N, 1000)
    out = Routine(x[:M, :], y[ind, :], b[ind, :], dtype, dtype_acc, sum_scheme)
    ref_out = Routine(x_[:M, :], y_, b_, "float64", "float64", "kahan_scheme")

    reference = ref_out.double()
    abs_err = (out.double() - reference).abs()
    scale = reference.abs().mean()

    mean_err = (abs_err.mean() / scale).item()
    max_err = (abs_err.max() / scale).item()
    # Report exact zeros as NaN instead of 0.
    mean_err = float('NaN') if mean_err == 0 else mean_err
    max_err = float('NaN') if max_err == 0 else max_err
    print("accuracy of an MxN convolution, with M = {}, N ={:7}: mean err={:.1e}, max err={:.1e}".format(M, N, mean_err, max_err))
    return elapsed, mean_err, max_err
def bench_config(Routine, backend, dev, lang, dtype, dtype_acc, sum_scheme):
    """Benchmark one configuration for every number of samples in ``NS``.

    The number of timed runs starts at 100 and is lowered (10, 1, then 0,
    i.e. no timing at all) whenever a batch of runs becomes too long.

    Returns:
        Three lists ``(times, mean_errs, max_errs)`` with one entry per value
        in ``NS``; entries that could not be computed are filled with NaN.
    """
    banner = "Backend : {}, Device : {}, dtype : {}, dtype_acc : {}, sum_scheme : {} -------------"
    print(banner.format(backend, dev, dtype, dtype_acc, sum_scheme))

    timings, mean_errors, max_errors = [], [], []

    try:
        run_schedule = [100, 10, 1, 0]
        n_runs = run_schedule.pop(0)

        for n_samples in NS:
            per_run, err_mean, err_max = benchmark(
                Routine, dev, n_samples, D, n_runs,
                lang, dtype, dtype_acc, sum_scheme,
            )
            timings.append(per_run)
            mean_errors.append(err_mean)
            max_errors.append(err_max)

            # Once timings are disabled, there is nothing left to adjust.
            if n_runs <= 0:
                continue

            batch_time = n_runs * per_run
            over_budget = batch_time > MAXTIME
            worth_reducing = n_runs > 1 and batch_time > REDTIME / 10
            if over_budget or worth_reducing:
                n_runs = run_schedule.pop(0)

    except RuntimeError:
        print("**\nMemory overflow !")
    except IndexError:
        print("**\nToo slow !")

    # Pad the results so that every list has one entry per value in NS.
    padding = [np.nan] * (len(NS) - len(timings))
    return timings + padding, mean_errors + padding, max_errors + padding
def full_bench(title, routines):
    """Benchmark every configuration, then plot and save the results.

    Three figures are created (runtimes, relative mean errors, relative max
    errors), and each table is saved as
    ``output/<output_filename>_<index>.csv``.

    Args:
        title: name of the benchmark, used in the printed banner and in the
            figure titles.
        routines: list of tuples
            ``(routine, backend, lang, dtype, dtype_acc, sum_scheme)``,
            one per configuration to benchmark.
    """
    backends = [backend for (_, backend, _, _, _, _) in routines]

    print("Benchmarking : {} ===============================".format(title))

    # The first line of each table holds the numbers of samples.
    lines_times = [NS]
    lines_mean_errs = [NS]
    lines_max_errs = [NS]
    device = "cuda" if use_cuda else "cpu"
    for routine, backend, lang, dtype, dtype_acc, sum_scheme in routines:
        times, mean_errs, max_errs = bench_config(
            routine, backend, device, lang, dtype, dtype_acc, sum_scheme
        )
        lines_times.append(times)
        lines_mean_errs.append(mean_errs)
        lines_max_errs.append(max_errs)

    # After transposition: one row per value in NS, one column per config.
    all_benches = [
        np.array(lines).T
        for lines in (lines_times, lines_mean_errs, lines_max_errs)
    ]
    figure_specs = [
        ('Runtimes for {} in dimension {}', 'Seconds'),
        ('Mean errors for {} in dimension {}', 'Relative mean error'),
        ('Max errors for {} in dimension {}', 'Relative max error'),
    ]
    linestyles = ["o-", "s-", "^-", "<-", ">-", "v-", "+-", "*-", "x-", "p-", "d-"]

    header = "Npoints " + " ".join(backends)
    os.makedirs("output", exist_ok=True)
    # Integer sample counts, then scientific notation for the measurements:
    # a fixed-point format would write small errors (e.g. 1e-7) as zeros.
    csv_formats = ['%-9d'] + ['%-12.5e'] * len(routines)

    for ind_benches, benches in enumerate(all_benches):
        title_format, ylabel = figure_specs[ind_benches]

        # Creates a pyplot figure:
        plt.figure(figsize=(12, 8))
        for i, config in enumerate(routines):
            # Cycle through the styles if there are more configs than styles.
            plt.plot(benches[:, 0], benches[:, i + 1],
                     linestyles[i % len(linestyles)],
                     linewidth=2, label='config = "{}"'.format(config[3:]))

        plt.xlabel('Number of samples')
        plt.title(title_format.format(title, D))
        plt.ylabel(ylabel)

        plt.yscale('log')
        plt.xscale('log')
        plt.legend(loc='upper left')
        plt.grid(True, which="major", linestyle="-")
        plt.grid(True, which="minor", linestyle="dotted")

        true_vals = benches[:, 1:].flatten()
        true_vals = true_vals[np.isfinite(true_vals)]
        # Without any finite value, keep the default axis limits.
        if true_vals.size > 0:
            y_max = MAXTIME if ind_benches == 0 else 100 * true_vals.max()
            plt.axis([NS[0], NS[-1], true_vals.min(), y_max])
        plt.tight_layout()

        # Save as a .csv to put a nice Tikz figure in the papers:
        np.savetxt("output/" + output_filename + "_" + str(ind_benches) + ".csv",
                   benches, fmt=csv_formats, header=header, comments='')
```

## KeOps

```
# Each configuration is described by (label, dtype, dtype_acc, sum_scheme);
# all of them run the same LazyTensor routine on torch tensors.
routines = [
    (conv_lazytensor, label, "torch", dtype, dtype_acc, sum_scheme)
    for label, dtype, dtype_acc, sum_scheme in (
        ("float16, direct_sum", "float16", "float16", "direct_sum"),
        ("float16, block_sum", "float16", "float16", "block_sum"),
        ("float16, kahan_scheme", "float16", "float16", "kahan_scheme"),
        ("float16, float32 acc", "float16", "float32", "block_sum"),
        ("float32, direct_sum", "float32", "float32", "direct_sum"),
        ("float32, block_sum", "float32", "float32", "block_sum"),
        ("float32, kahan_scheme", "float32", "float32", "kahan_scheme"),
        ("float32, float64 acc", "float32", "float64", "block_sum"),
        ("float64, direct_sum", "float64", "float64", "direct_sum"),
        ("float64, block_sum", "float64", "float64", "block_sum"),
        ("float64, kahan_scheme", "float64", "float64", "kahan_scheme"),
    )
]

# Run the whole benchmark, then display the figures.
full_bench(" Matrix-Vector products", routines)
plt.show()
```

Out:

```
Benchmarking : Matrix-Vector products ===============================
Backend : float16, direct_sum, Device : cuda, dtype : float16, dtype_acc : float16, sum_scheme : direct_sum -------------
timing of 100 NxN convolution(s), with N = 100: 100x0.000006s
accuracy of an MxN convolution, with M = 100, N = 100: mean err=1.5e-03, max err=6.9e-03
timing of 100 NxN convolution(s), with N = 200: 100x0.000006s
accuracy of an MxN convolution, with M = 200, N = 200: mean err=3.6e-03, max err=1.7e-02
timing of 100 NxN convolution(s), with N = 500: 100x0.000006s
accuracy of an MxN convolution, with M = 500, N = 500: mean err=4.2e-03, max err=2.9e-02
timing of 100 NxN convolution(s), with N = 1000: 100x0.000006s
accuracy of an MxN convolution, with M = 1000, N = 1000: mean err=6.1e-03, max err=3.6e-02
timing of 100 NxN convolution(s), with N = 2000: 100x0.000007s
accuracy of an MxN convolution, with M = 1000, N = 2000: mean err=4.6e-03, max err=3.2e-02
timing of 100 NxN convolution(s), with N = 5000: 100x0.000007s
accuracy of an MxN convolution, with M = 1000, N = 5000: mean err=8.6e-03, max err=7.6e-02
timing of 100 NxN convolution(s), with N = 10000: 100x0.000008s
accuracy of an MxN convolution, with M = 1000, N = 10000: mean err=1.2e-02, max err=9.0e-02
timing of 100 NxN convolution(s), with N = 20000: 100x0.000011s
accuracy of an MxN convolution, with M = 1000, N = 20000: mean err=1.3e-02, max err=9.4e-02
timing of 100 NxN convolution(s), with N = 50000: 100x0.000023s
accuracy of an MxN convolution, with M = 1000, N = 50000: mean err=2.7e-02, max err=2.6e-01
timing of 10 NxN convolution(s), with N = 100000: 10x0.000730s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=5.0e-02, max err=3.7e-01
timing of 10 NxN convolution(s), with N = 200000: 10x0.002647s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=1.0e-01, max err=6.0e-01
timing of 1 NxN convolution(s), with N = 500000: 1x0.158905s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=7.3e-02, max err=3.7e-01
timing of 1 NxN convolution(s), with N =1000000: 1x0.620670s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=1.1e-01, max err=8.0e-01
timing of 1 NxN convolution(s), with N =2000000: 1x2.455769s
accuracy of an MxN convolution, with M = 1000, N =2000000: mean err=1.2e-01, max err=8.1e-01
timing of 1 NxN convolution(s), with N =5000000: 1x15.432480s
accuracy of an MxN convolution, with M = 1000, N =5000000: mean err=2.9e-01, max err=1.6e+00
Backend : float16, block_sum, Device : cuda, dtype : float16, dtype_acc : float16, sum_scheme : block_sum -------------
timing of 100 NxN convolution(s), with N = 100: 100x0.000006s
accuracy of an MxN convolution, with M = 100, N = 100: mean err=1.5e-03, max err=6.9e-03
timing of 100 NxN convolution(s), with N = 200: 100x0.000006s
accuracy of an MxN convolution, with M = 200, N = 200: mean err=3.6e-03, max err=1.6e-02
timing of 100 NxN convolution(s), with N = 500: 100x0.000006s
accuracy of an MxN convolution, with M = 500, N = 500: mean err=2.6e-03, max err=1.5e-02
timing of 100 NxN convolution(s), with N = 1000: 100x0.000006s
accuracy of an MxN convolution, with M = 1000, N = 1000: mean err=4.0e-03, max err=2.1e-02
timing of 100 NxN convolution(s), with N = 2000: 100x0.000007s
accuracy of an MxN convolution, with M = 1000, N = 2000: mean err=1.5e-03, max err=6.7e-03
timing of 100 NxN convolution(s), with N = 5000: 100x0.000007s
accuracy of an MxN convolution, with M = 1000, N = 5000: mean err=2.2e-03, max err=1.1e-02
timing of 100 NxN convolution(s), with N = 10000: 100x0.000008s
accuracy of an MxN convolution, with M = 1000, N = 10000: mean err=1.8e-03, max err=1.0e-02
timing of 100 NxN convolution(s), with N = 20000: 100x0.000011s
accuracy of an MxN convolution, with M = 1000, N = 20000: mean err=2.2e-03, max err=9.3e-03
timing of 100 NxN convolution(s), with N = 50000: 100x0.000025s
accuracy of an MxN convolution, with M = 1000, N = 50000: mean err=3.2e-03, max err=1.7e-02
timing of 10 NxN convolution(s), with N = 100000: 10x0.000714s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=5.3e-03, max err=2.3e-02
timing of 10 NxN convolution(s), with N = 200000: 10x0.002656s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=9.0e-03, max err=4.7e-02
timing of 1 NxN convolution(s), with N = 500000: 1x0.160120s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=6.0e-03, max err=3.8e-02
timing of 1 NxN convolution(s), with N =1000000: 1x0.624766s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=9.4e-03, max err=7.0e-02
timing of 1 NxN convolution(s), with N =2000000: 1x2.470396s
accuracy of an MxN convolution, with M = 1000, N =2000000: mean err=9.8e-03, max err=6.8e-02
timing of 1 NxN convolution(s), with N =5000000: 1x15.478374s
accuracy of an MxN convolution, with M = 1000, N =5000000: mean err=2.4e-02, max err=1.8e-01
Backend : float16, kahan_scheme, Device : cuda, dtype : float16, dtype_acc : float16, sum_scheme : kahan_scheme -------------
timing of 100 NxN convolution(s), with N = 100: 100x0.000006s
accuracy of an MxN convolution, with M = 100, N = 100: mean err=1.0e-03, max err=3.0e-03
timing of 100 NxN convolution(s), with N = 200: 100x0.000006s
accuracy of an MxN convolution, with M = 200, N = 200: mean err=1.6e-03, max err=4.0e-03
timing of 100 NxN convolution(s), with N = 500: 100x0.000006s
accuracy of an MxN convolution, with M = 500, N = 500: mean err=9.4e-04, max err=4.8e-03
timing of 100 NxN convolution(s), with N = 1000: 100x0.000006s
accuracy of an MxN convolution, with M = 1000, N = 1000: mean err=9.4e-04, max err=4.3e-03
timing of 100 NxN convolution(s), with N = 2000: 100x0.000007s
accuracy of an MxN convolution, with M = 1000, N = 2000: mean err=4.6e-04, max err=1.8e-03
timing of 100 NxN convolution(s), with N = 5000: 100x0.000008s
accuracy of an MxN convolution, with M = 1000, N = 5000: mean err=6.6e-04, max err=2.9e-03
timing of 100 NxN convolution(s), with N = 10000: 100x0.000010s
accuracy of an MxN convolution, with M = 1000, N = 10000: mean err=4.3e-04, max err=1.9e-03
timing of 100 NxN convolution(s), with N = 20000: 100x0.000012s
accuracy of an MxN convolution, with M = 1000, N = 20000: mean err=5.3e-04, max err=2.6e-03
timing of 100 NxN convolution(s), with N = 50000: 100x0.000029s
accuracy of an MxN convolution, with M = 1000, N = 50000: mean err=5.8e-04, max err=3.5e-03
timing of 10 NxN convolution(s), with N = 100000: 10x0.000949s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=9.7e-04, max err=5.6e-03
timing of 10 NxN convolution(s), with N = 200000: 10x0.003500s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=1.6e-03, max err=7.3e-03
timing of 1 NxN convolution(s), with N = 500000: 1x0.209998s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=4.0e-04, max err=2.0e-03
timing of 1 NxN convolution(s), with N =1000000: 1x0.819025s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=7.9e-04, max err=3.6e-03
timing of 1 NxN convolution(s), with N =2000000: 1x3.251778s
accuracy of an MxN convolution, with M = 1000, N =2000000: mean err=6.8e-04, max err=3.3e-03
timing of 1 NxN convolution(s), with N =5000000: 1x20.290438s
accuracy of an MxN convolution, with M = 1000, N =5000000: mean err=6.7e-04, max err=3.0e-03
Backend : float16, float32 acc, Device : cuda, dtype : float16, dtype_acc : float32, sum_scheme : block_sum -------------
timing of 100 NxN convolution(s), with N = 100: 100x0.000006s
accuracy of an MxN convolution, with M = 100, N = 100: mean err=1.5e-03, max err=6.9e-03
timing of 100 NxN convolution(s), with N = 200: 100x0.000006s
accuracy of an MxN convolution, with M = 200, N = 200: mean err=3.6e-03, max err=1.6e-02
timing of 100 NxN convolution(s), with N = 500: 100x0.000006s
accuracy of an MxN convolution, with M = 500, N = 500: mean err=2.6e-03, max err=1.5e-02
timing of 100 NxN convolution(s), with N = 1000: 100x0.000006s
accuracy of an MxN convolution, with M = 1000, N = 1000: mean err=4.0e-03, max err=2.2e-02
timing of 100 NxN convolution(s), with N = 2000: 100x0.000007s
accuracy of an MxN convolution, with M = 1000, N = 2000: mean err=1.5e-03, max err=6.8e-03
timing of 100 NxN convolution(s), with N = 5000: 100x0.000007s
accuracy of an MxN convolution, with M = 1000, N = 5000: mean err=2.1e-03, max err=9.7e-03
timing of 100 NxN convolution(s), with N = 10000: 100x0.000008s
accuracy of an MxN convolution, with M = 1000, N = 10000: mean err=1.5e-03, max err=7.3e-03
timing of 100 NxN convolution(s), with N = 20000: 100x0.000011s
accuracy of an MxN convolution, with M = 1000, N = 20000: mean err=1.9e-03, max err=9.0e-03
timing of 100 NxN convolution(s), with N = 50000: 100x0.000024s
accuracy of an MxN convolution, with M = 1000, N = 50000: mean err=2.3e-03, max err=1.6e-02
timing of 10 NxN convolution(s), with N = 100000: 10x0.000719s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=3.8e-03, max err=1.8e-02
timing of 10 NxN convolution(s), with N = 200000: 10x0.002693s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=5.0e-03, max err=2.3e-02
timing of 1 NxN convolution(s), with N = 500000: 1x0.162493s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=1.5e-03, max err=7.3e-03
timing of 1 NxN convolution(s), with N =1000000: 1x0.631087s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=2.5e-03, max err=1.3e-02
timing of 1 NxN convolution(s), with N =2000000: 1x2.500815s
accuracy of an MxN convolution, with M = 1000, N =2000000: mean err=1.8e-03, max err=8.0e-03
timing of 1 NxN convolution(s), with N =5000000: 1x15.637593s
accuracy of an MxN convolution, with M = 1000, N =5000000: mean err=2.2e-03, max err=9.8e-03
Backend : float32, direct_sum, Device : cuda, dtype : float32, dtype_acc : float32, sum_scheme : direct_sum -------------
timing of 100 NxN convolution(s), with N = 100: 100x0.000003s
accuracy of an MxN convolution, with M = 100, N = 100: mean err=1.4e-07, max err=5.8e-07
timing of 100 NxN convolution(s), with N = 200: 100x0.000003s
accuracy of an MxN convolution, with M = 200, N = 200: mean err=4.2e-07, max err=1.7e-06
timing of 100 NxN convolution(s), with N = 500: 100x0.000003s
accuracy of an MxN convolution, with M = 500, N = 500: mean err=4.0e-07, max err=2.4e-06
timing of 100 NxN convolution(s), with N = 1000: 100x0.000003s
accuracy of an MxN convolution, with M = 1000, N = 1000: mean err=6.4e-07, max err=3.8e-06
timing of 100 NxN convolution(s), with N = 2000: 100x0.000004s
accuracy of an MxN convolution, with M = 1000, N = 2000: mean err=5.0e-07, max err=5.3e-06
timing of 100 NxN convolution(s), with N = 5000: 100x0.000004s
accuracy of an MxN convolution, with M = 1000, N = 5000: mean err=1.1e-06, max err=6.5e-06
timing of 100 NxN convolution(s), with N = 10000: 100x0.000005s
accuracy of an MxN convolution, with M = 1000, N = 10000: mean err=1.7e-06, max err=1.1e-05
timing of 100 NxN convolution(s), with N = 20000: 100x0.000010s
accuracy of an MxN convolution, with M = 1000, N = 20000: mean err=1.7e-06, max err=1.1e-05
timing of 100 NxN convolution(s), with N = 50000: 100x0.000035s
accuracy of an MxN convolution, with M = 1000, N = 50000: mean err=4.0e-06, max err=2.3e-05
timing of 10 NxN convolution(s), with N = 100000: 10x0.001268s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=6.0e-06, max err=3.9e-05
timing of 10 NxN convolution(s), with N = 200000: 10x0.004954s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=7.3e-06, max err=4.8e-05
timing of 1 NxN convolution(s), with N = 500000: 1x0.296531s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=1.1e-05, max err=6.8e-05
timing of 1 NxN convolution(s), with N =1000000: 1x1.180207s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=1.4e-05, max err=8.6e-05
timing of 1 NxN convolution(s), with N =2000000: 1x4.712267s
accuracy of an MxN convolution, with M = 1000, N =2000000: mean err=1.6e-05, max err=9.7e-05
timing of 1 NxN convolution(s), with N =5000000: 1x29.603041s
accuracy of an MxN convolution, with M = 1000, N =5000000: mean err=4.9e-05, max err=2.3e-04
Backend : float32, block_sum, Device : cuda, dtype : float32, dtype_acc : float32, sum_scheme : block_sum -------------
timing of 100 NxN convolution(s), with N = 100: 100x0.000003s
accuracy of an MxN convolution, with M = 100, N = 100: mean err=1.4e-07, max err=5.8e-07
timing of 100 NxN convolution(s), with N = 200: 100x0.000003s
accuracy of an MxN convolution, with M = 200, N = 200: mean err=4.2e-07, max err=1.8e-06
timing of 100 NxN convolution(s), with N = 500: 100x0.000003s
accuracy of an MxN convolution, with M = 500, N = 500: mean err=3.0e-07, max err=1.6e-06
timing of 100 NxN convolution(s), with N = 1000: 100x0.000003s
accuracy of an MxN convolution, with M = 1000, N = 1000: mean err=4.3e-07, max err=2.5e-06
timing of 100 NxN convolution(s), with N = 2000: 100x0.000004s
accuracy of an MxN convolution, with M = 1000, N = 2000: mean err=1.4e-07, max err=7.6e-07
timing of 100 NxN convolution(s), with N = 5000: 100x0.000004s
accuracy of an MxN convolution, with M = 1000, N = 5000: mean err=2.8e-07, max err=1.5e-06
timing of 100 NxN convolution(s), with N = 10000: 100x0.000006s
accuracy of an MxN convolution, with M = 1000, N = 10000: mean err=2.2e-07, max err=1.1e-06
timing of 100 NxN convolution(s), with N = 20000: 100x0.000010s
accuracy of an MxN convolution, with M = 1000, N = 20000: mean err=2.8e-07, max err=1.5e-06
timing of 100 NxN convolution(s), with N = 50000: 100x0.000036s
accuracy of an MxN convolution, with M = 1000, N = 50000: mean err=3.9e-07, max err=2.4e-06
timing of 10 NxN convolution(s), with N = 100000: 10x0.001279s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=6.7e-07, max err=4.6e-06
timing of 10 NxN convolution(s), with N = 200000: 10x0.004966s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=8.2e-07, max err=4.3e-06
timing of 1 NxN convolution(s), with N = 500000: 1x0.298102s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=8.0e-07, max err=4.5e-06
timing of 1 NxN convolution(s), with N =1000000: 1x1.186337s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=1.0e-06, max err=6.6e-06
timing of 1 NxN convolution(s), with N =2000000: 1x4.735737s
accuracy of an MxN convolution, with M = 1000, N =2000000: mean err=1.2e-06, max err=8.2e-06
timing of 1 NxN convolution(s), with N =5000000: 1x29.628445s
accuracy of an MxN convolution, with M = 1000, N =5000000: mean err=3.3e-06, max err=2.0e-05
Backend : float32, kahan_scheme, Device : cuda, dtype : float32, dtype_acc : float32, sum_scheme : kahan_scheme -------------
timing of 100 NxN convolution(s), with N = 100: 100x0.000003s
accuracy of an MxN convolution, with M = 100, N = 100: mean err=8.6e-08, max err=3.2e-07
timing of 100 NxN convolution(s), with N = 200: 100x0.000003s
accuracy of an MxN convolution, with M = 200, N = 200: mean err=1.2e-07, max err=4.2e-07
timing of 100 NxN convolution(s), with N = 500: 100x0.000003s
accuracy of an MxN convolution, with M = 500, N = 500: mean err=1.2e-07, max err=6.3e-07
timing of 100 NxN convolution(s), with N = 1000: 100x0.000003s
accuracy of an MxN convolution, with M = 1000, N = 1000: mean err=1.3e-07, max err=7.1e-07
timing of 100 NxN convolution(s), with N = 2000: 100x0.000004s
accuracy of an MxN convolution, with M = 1000, N = 2000: mean err=5.4e-08, max err=3.6e-07
timing of 100 NxN convolution(s), with N = 5000: 100x0.000005s
accuracy of an MxN convolution, with M = 1000, N = 5000: mean err=9.0e-08, max err=5.0e-07
timing of 100 NxN convolution(s), with N = 10000: 100x0.000006s
accuracy of an MxN convolution, with M = 1000, N = 10000: mean err=6.3e-08, max err=3.2e-07
timing of 100 NxN convolution(s), with N = 20000: 100x0.000012s
accuracy of an MxN convolution, with M = 1000, N = 20000: mean err=6.5e-08, max err=3.3e-07
timing of 100 NxN convolution(s), with N = 50000: 100x0.000048s
accuracy of an MxN convolution, with M = 1000, N = 50000: mean err=7.1e-08, max err=3.8e-07
timing of 10 NxN convolution(s), with N = 100000: 10x0.001761s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=1.3e-07, max err=5.6e-07
timing of 10 NxN convolution(s), with N = 200000: 10x0.006880s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=1.7e-07, max err=8.1e-07
timing of 1 NxN convolution(s), with N = 500000: 1x0.417349s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=6.5e-08, max err=3.3e-07
timing of 1 NxN convolution(s), with N =1000000: 1x1.649258s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=7.7e-08, max err=3.7e-07
timing of 1 NxN convolution(s), with N =2000000: 1x6.571167s
accuracy of an MxN convolution, with M = 1000, N =2000000: mean err=7.6e-08, max err=3.9e-07
timing of 1 NxN convolution(s), with N =5000000: 1x41.027161s
accuracy of an MxN convolution, with M = 1000, N =5000000: mean err=8.0e-08, max err=4.1e-07
Backend : float32, float64 acc, Device : cuda, dtype : float32, dtype_acc : float64, sum_scheme : block_sum -------------
timing of 100 NxN convolution(s), with N = 100: 100x0.000003s
accuracy of an MxN convolution, with M = 100, N = 100: mean err=1.4e-07, max err=5.8e-07
timing of 100 NxN convolution(s), with N = 200: 100x0.000003s
accuracy of an MxN convolution, with M = 200, N = 200: mean err=4.2e-07, max err=1.8e-06
timing of 100 NxN convolution(s), with N = 500: 100x0.000003s
accuracy of an MxN convolution, with M = 500, N = 500: mean err=3.0e-07, max err=1.6e-06
timing of 100 NxN convolution(s), with N = 1000: 100x0.000003s
accuracy of an MxN convolution, with M = 1000, N = 1000: mean err=4.3e-07, max err=2.5e-06
timing of 100 NxN convolution(s), with N = 2000: 100x0.000004s
accuracy of an MxN convolution, with M = 1000, N = 2000: mean err=1.4e-07, max err=7.6e-07
timing of 100 NxN convolution(s), with N = 5000: 100x0.000004s
accuracy of an MxN convolution, with M = 1000, N = 5000: mean err=2.6e-07, max err=1.5e-06
timing of 100 NxN convolution(s), with N = 10000: 100x0.000005s
accuracy of an MxN convolution, with M = 1000, N = 10000: mean err=1.8e-07, max err=1.0e-06
timing of 100 NxN convolution(s), with N = 20000: 100x0.000010s
accuracy of an MxN convolution, with M = 1000, N = 20000: mean err=2.5e-07, max err=1.3e-06
timing of 100 NxN convolution(s), with N = 50000: 100x0.000035s
accuracy of an MxN convolution, with M = 1000, N = 50000: mean err=2.8e-07, max err=1.7e-06
timing of 10 NxN convolution(s), with N = 100000: 10x0.001273s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=4.8e-07, max err=2.2e-06
timing of 10 NxN convolution(s), with N = 200000: 10x0.004963s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=6.1e-07, max err=3.1e-06
timing of 1 NxN convolution(s), with N = 500000: 1x0.297461s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=1.8e-07, max err=8.7e-07
timing of 1 NxN convolution(s), with N =1000000: 1x1.187766s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=3.1e-07, max err=1.8e-06
timing of 1 NxN convolution(s), with N =2000000: 1x4.754791s
accuracy of an MxN convolution, with M = 1000, N =2000000: mean err=2.2e-07, max err=1.3e-06
timing of 1 NxN convolution(s), with N =5000000: 1x29.649347s
accuracy of an MxN convolution, with M = 1000, N =5000000: mean err=2.6e-07, max err=1.2e-06
Backend : float64, direct_sum, Device : cuda, dtype : float64, dtype_acc : float64, sum_scheme : direct_sum -------------
timing of 100 NxN convolution(s), with N = 100: 100x0.000003s
accuracy of an MxN convolution, with M = 100, N = 100: mean err=2.1e-16, max err=1.3e-15
timing of 100 NxN convolution(s), with N = 200: 100x0.000004s
accuracy of an MxN convolution, with M = 200, N = 200: mean err=8.7e-16, max err=3.7e-15
timing of 100 NxN convolution(s), with N = 500: 100x0.000005s
accuracy of an MxN convolution, with M = 500, N = 500: mean err=8.1e-16, max err=4.7e-15
timing of 100 NxN convolution(s), with N = 1000: 100x0.000007s
accuracy of an MxN convolution, with M = 1000, N = 1000: mean err=1.2e-15, max err=6.0e-15
timing of 100 NxN convolution(s), with N = 2000: 100x0.000011s
accuracy of an MxN convolution, with M = 1000, N = 2000: mean err=9.2e-16, max err=6.2e-15
timing of 100 NxN convolution(s), with N = 5000: 100x0.000023s
accuracy of an MxN convolution, with M = 1000, N = 5000: mean err=2.0e-15, max err=1.4e-14
timing of 10 NxN convolution(s), with N = 10000: 10x0.000433s
accuracy of an MxN convolution, with M = 1000, N = 10000: mean err=3.1e-15, max err=1.8e-14
timing of 10 NxN convolution(s), with N = 20000: 10x0.001626s
accuracy of an MxN convolution, with M = 1000, N = 20000: mean err=3.1e-15, max err=2.1e-14
timing of 10 NxN convolution(s), with N = 50000: 10x0.007993s
accuracy of an MxN convolution, with M = 1000, N = 50000: mean err=7.1e-15, max err=7.1e-14
timing of 1 NxN convolution(s), with N = 100000: 1x0.318433s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=1.1e-14, max err=6.3e-14
timing of 1 NxN convolution(s), with N = 200000: 1x1.271697s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=1.4e-14, max err=6.5e-14
timing of 1 NxN convolution(s), with N = 500000: 1x7.751096s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=2.0e-14, max err=1.4e-13
timing of 1 NxN convolution(s), with N =1000000: 1x30.585474s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=2.4e-14, max err=1.5e-13
accuracy of an MxN convolution, with M = 1000, N =2000000: mean err=3.1e-14, max err=2.1e-13
accuracy of an MxN convolution, with M = 1000, N =5000000: mean err=8.7e-14, max err=4.4e-13
Backend : float64, block_sum, Device : cuda, dtype : float64, dtype_acc : float64, sum_scheme : block_sum -------------
timing of 100 NxN convolution(s), with N = 100: 100x0.000003s
accuracy of an MxN convolution, with M = 100, N = 100: mean err=2.1e-16, max err=1.3e-15
timing of 100 NxN convolution(s), with N = 200: 100x0.000004s
accuracy of an MxN convolution, with M = 200, N = 200: mean err=8.7e-16, max err=3.7e-15
timing of 100 NxN convolution(s), with N = 500: 100x0.000005s
accuracy of an MxN convolution, with M = 500, N = 500: mean err=5.6e-16, max err=2.8e-15
timing of 100 NxN convolution(s), with N = 1000: 100x0.000007s
accuracy of an MxN convolution, with M = 1000, N = 1000: mean err=7.7e-16, max err=3.5e-15
timing of 100 NxN convolution(s), with N = 2000: 100x0.000011s
accuracy of an MxN convolution, with M = 1000, N = 2000: mean err=2.5e-16, max err=1.5e-15
timing of 100 NxN convolution(s), with N = 5000: 100x0.000023s
accuracy of an MxN convolution, with M = 1000, N = 5000: mean err=4.8e-16, max err=2.6e-15
timing of 10 NxN convolution(s), with N = 10000: 10x0.000431s
accuracy of an MxN convolution, with M = 1000, N = 10000: mean err=4.0e-16, max err=2.2e-15
timing of 10 NxN convolution(s), with N = 20000: 10x0.001619s
accuracy of an MxN convolution, with M = 1000, N = 20000: mean err=5.0e-16, max err=3.0e-15
timing of 10 NxN convolution(s), with N = 50000: 10x0.007939s
accuracy of an MxN convolution, with M = 1000, N = 50000: mean err=7.4e-16, max err=4.7e-15
timing of 1 NxN convolution(s), with N = 100000: 1x0.316048s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=1.2e-15, max err=6.9e-15
timing of 1 NxN convolution(s), with N = 200000: 1x1.262631s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=1.4e-15, max err=7.1e-15
timing of 1 NxN convolution(s), with N = 500000: 1x7.690921s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=1.6e-15, max err=6.9e-15
timing of 1 NxN convolution(s), with N =1000000: 1x30.368532s
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=1.8e-15, max err=1.2e-14
accuracy of an MxN convolution, with M = 1000, N =2000000: mean err=2.3e-15, max err=1.5e-14
accuracy of an MxN convolution, with M = 1000, N =5000000: mean err=6.5e-15, max err=3.7e-14
Backend : float64, kahan_scheme, Device : cuda, dtype : float64, dtype_acc : float64, sum_scheme : kahan_scheme -------------
timing of 100 NxN convolution(s), with N = 100: 100x0.000004s
accuracy of an MxN convolution, with M = 100, N = 100: mean err=7.9e-17, max err=4.5e-16
timing of 100 NxN convolution(s), with N = 200: 100x0.000004s
accuracy of an MxN convolution, with M = 200, N = 200: mean err=1.3e-16, max err=6.0e-16
timing of 100 NxN convolution(s), with N = 500: 100x0.000006s
accuracy of an MxN convolution, with M = 500, N = 500: mean err=1.2e-16, max err=5.5e-16
timing of 100 NxN convolution(s), with N = 1000: 100x0.000009s
accuracy of an MxN convolution, with M = 1000, N = 1000: mean err=1.3e-16, max err=5.5e-16
timing of 100 NxN convolution(s), with N = 2000: 100x0.000015s
accuracy of an MxN convolution, with M = 1000, N = 2000: mean err=4.5e-17, max err=4.3e-16
timing of 100 NxN convolution(s), with N = 5000: 100x0.000032s
accuracy of an MxN convolution, with M = 1000, N = 5000: mean err=6.9e-17, max err=4.4e-16
timing of 10 NxN convolution(s), with N = 10000: 10x0.000610s
accuracy of an MxN convolution, with M = 1000, N = 10000: mean err=4.6e-17, max err=4.5e-16
timing of 10 NxN convolution(s), with N = 20000: 10x0.002338s
accuracy of an MxN convolution, with M = 1000, N = 20000: mean err=5.6e-17, max err=3.8e-16
timing of 1 NxN convolution(s), with N = 50000: 1x0.115381s
accuracy of an MxN convolution, with M = 1000, N = 50000: mean err=6.9e-17, max err=5.9e-16
timing of 1 NxN convolution(s), with N = 100000: 1x0.460229s
accuracy of an MxN convolution, with M = 1000, N = 100000: mean err=1.2e-16, max err=6.9e-16
timing of 1 NxN convolution(s), with N = 200000: 1x1.839525s
accuracy of an MxN convolution, with M = 1000, N = 200000: mean err=1.5e-16, max err=8.1e-16
timing of 1 NxN convolution(s), with N = 500000: 1x11.208046s
accuracy of an MxN convolution, with M = 1000, N = 500000: mean err=4.1e-17, max err=2.4e-16
accuracy of an MxN convolution, with M = 1000, N =1000000: mean err=7.7e-17, max err=5.6e-16
accuracy of an MxN convolution, with M = 1000, N =2000000: mean err=5.5e-17, max err=5.8e-16
accuracy of an MxN convolution, with M = 1000, N =5000000: mean err=6.9e-17, max err=4.3e-16
```

**Total running time of the script:** ( 7 minutes 3.279 seconds)