Note

Click here to download the full example code

# Scaling up Gaussian convolutions on 3D point clouds¶

Let’s compare the performances of PyTorch and KeOps on simple Gaussian RBF kernel products, as the number of samples grows from 100 to 1,000,000.

Note

In this demo, we use exact **bruteforce** computations
(tensorized for PyTorch and online for KeOps), without leveraging any multiscale
or low-rank (multipole) decomposition of the Kernel matrix.
Please visit the documentation of the GeomLoss package
for a discussion of clever, scalable schemes.

## Setup¶

```
import importlib
import os
import time
import numpy as np
import torch
from matplotlib import pyplot as plt
use_cuda = torch.cuda.is_available()
```

Benchmark specifications:

```
D = 3 # Let's do this in 3D
MAXTIME = 10 if use_cuda else 1 # Max number of seconds before we break the loop
REDTIME = 2 if use_cuda else .2 # Decrease the number of runs if computations take longer than 2s...
# Number of samples that we'll loop upon
NS = [100, 200, 500,
1000, 2000, 5000,
10000, 20000, 50000,
100000, 200000, 500000,
1000000]
```

Synthetic dataset. Feel free to use a Stanford Bunny, or whatever!

```
def generate_samples(N, device, lang, batchsize=None):
"""Create point clouds sampled non-uniformly on a sphere of diameter 1."""
B = () if batchsize is None else (batchsize,)
if lang == 'torch':
if device == 'cuda':
torch.cuda.manual_seed_all(1234)
else:
torch.manual_seed(1234)
x = torch.randn(B + (N, D), device=device)
x[:,0] += 1
x = x / (2*x.norm(dim=1,keepdim=True))
y = torch.randn(B + (N, D), device=device)
y[:,1] += 2
y = y / (2*y.norm(dim=1,keepdim=True))
# Draw a random source signal:
b = torch.randn(B + (N, 1), device=device)
else:
np.random.seed(1234)
x = np.random.rand(*(B + (N, D))).astype('float32')
y = np.random.rand(*(B + (N, D))).astype('float32')
b = np.random.randn(*(B + (N,))).astype('float32')
return x, y, b
```

Define a simple Gaussian RBF product, using a **tensorized** implementation:

```
def gaussianconv_numpy(x, y, b):
K_xy = np.exp( - np.sum( (x[:,None,:] - y[None,:,:]) **2, axis=2) )
return K_xy@b
def gaussianconv_pytorch(x, y, b):
D_xx = (x*x).sum(-1).unsqueeze(1) # (N,1)
D_xy = torch.matmul( x, y.permute(1,0) ) # (N,D) @ (D,M) = (N,M)
D_yy = (y*y).sum(-1).unsqueeze(0) # (1,M)
D_xy = D_xx - 2*D_xy + D_yy
K_xy = (-D_xy).exp()
return K_xy @ b
```

Define a simple Gaussian RBF product, using an **online** implementation:

```
from pykeops.torch import generic_sum
def gaussianconv_keops(x, y, b):
fun = generic_sum("Exp(-SqDist(X,Y)) * B", # Formula
"A = Vi(1)", # Output
"X = Vi({})".format(D), # 1st argument
"Y = Vj({})".format(D), # 2nd argument
"B = Vj(1)" ) # 3rd argument
backend = 'GPU' if use_cuda else 'CPU'
return fun(x, y, b, backend=backend)
```

Finally, perform the same operation with our high-level `pykeops.torch.LazyTensor`

wrapper:

```
from pykeops.torch import LazyTensor
def gaussianconv_lazytensor(x, y, b):
backend = 'GPU' if use_cuda else 'CPU'
nbatchdims = len(x.shape) - 2
x_i = LazyTensor(x.unsqueeze(-2)) # (B, M, 1, D)
y_j = LazyTensor(y.unsqueeze(-3)) # (B, 1, N, D)
D_ij = ((x_i - y_j) ** 2).sum(-1) # (B, M, N, 1)
K_ij = (- D_ij).exp() # (B, M, N, 1)
S_ij = K_ij * b.unsqueeze(-3) # (B, M, N, 1) * (B, 1, N, 1)
return S_ij.sum(dim=nbatchdims + 1, backend=backend)
```

## Benchmarking loops¶

```
def benchmark(routine_batchsize, dev, N, loops=10, lang='torch'):
"""Times a convolution on an N-by-N problem."""
if isinstance(routine_batchsize, tuple):
Routine, B = routine_batchsize
else:
Routine, B = routine_batchsize, None
importlib.reload(torch) # In case we had a memory overflow just before...
device = torch.device(dev)
x, y, b = generate_samples(N, device, lang, batchsize=B)
# We simply benchmark a convolution
code = "a = Routine( x, y, b ) "
exec( code, locals() ) # Warmup run, to compile and load everything
t_0 = time.perf_counter() # Actual benchmark --------------------
if use_cuda: torch.cuda.synchronize()
for i in range(loops):
exec( code, locals() )
if use_cuda: torch.cuda.synchronize()
elapsed = time.perf_counter() - t_0 # ---------------------------
if B is None:
print("{:3} NxN convolution, with N ={:7}: {:3}x{:3.6f}s".format(loops, N, loops, elapsed / loops))
return elapsed / loops
else:
print("{:3}x{:3} NxN convolution, with N ={:7}: {:3}x{:3}x{:3.6f}s".format(
B, loops, N, B, loops, elapsed / (B * loops)))
return elapsed / (B * loops)
def bench_config(Routine, backend, dev, l) :
"""Times a convolution for an increasing number of samples."""
print("Backend : {}, Device : {} -------------".format(backend, dev))
times = []
try :
Nloops = [100, 10, 1]
nloops = Nloops.pop(0)
for n in NS :
elapsed = benchmark(Routine, dev, n, loops=nloops, lang=l)
times.append( elapsed )
if (nloops * elapsed > MAXTIME) \
or (nloops * elapsed > REDTIME/10 and len(Nloops) > 0 ) :
nloops = Nloops.pop(0)
except RuntimeError :
print("**\nMemory overflow !")
except IndexError :
print("**\nToo slow !")
return times + (len(NS)-len(times)) * [np.nan]
def full_bench(title, routines) :
"""Benchmarks the varied backends of a geometric loss function."""
backends = [ backend for (_, backend, _) in routines ]
print("Benchmarking : {} ===============================".format(title))
lines = [ NS ]
for routine, backend, lang in routines :
lines.append( bench_config(routine, backend, "cuda" if use_cuda else "cpu", lang) )
benches = np.array(lines).T
# Creates a pyplot figure:
plt.figure(figsize=(12,8))
linestyles = ["o-", "s-", "^-"]
for i, backend in enumerate(backends):
plt.plot( benches[:,0], benches[:,i+1], linestyles[i],
linewidth=2, label='backend = "{}"'.format(backend) )
for (j, val) in enumerate( benches[:,i+1] ):
if np.isnan(val) and j > 0:
x, y = benches[j-1,0], benches[j-1,i+1]
plt.annotate('Memory overflow!',
xy=(x, 1.05*y),
horizontalalignment='center',
verticalalignment='bottom')
break
plt.title('Runtimes for {} in dimension {}'.format(title, D))
plt.xlabel('Number of samples')
plt.ylabel('Seconds')
plt.yscale('log') ; plt.xscale('log')
plt.legend(loc='upper left')
plt.grid(True, which="major", linestyle="-")
plt.grid(True, which="minor", linestyle="dotted")
plt.axis([NS[0], NS[-1], 1e-5, MAXTIME])
plt.tight_layout()
# Save as a .csv to put a nice Tikz figure in the papers:
header = "Npoints " + " ".join(backends)
os.makedirs("output", exist_ok=True)
np.savetxt("output/benchmark_convolutions_3D.csv", benches,
fmt='%-9.5f', header=header, comments='')
```

## NumPy vs. PyTorch vs. KeOps (Gpu)¶

```
if use_cuda:
routines = [ (gaussianconv_numpy, "Numpy (Cpu)", "numpy"),
(gaussianconv_pytorch, "PyTorch (Gpu)", "torch"),
(gaussianconv_keops, "KeOps (Gpu)", "torch"), ]
full_bench( "Gaussian Matrix-Vector products", routines )
```

Out:

```
Benchmarking : Gaussian Matrix-Vector products ===============================
Backend : Numpy (Cpu), Device : cuda -------------
100 NxN convolution, with N = 100: 100x0.000321s
100 NxN convolution, with N = 200: 100x0.001779s
100 NxN convolution, with N = 500: 100x0.009941s
10 NxN convolution, with N = 1000: 10x0.038909s
1 NxN convolution, with N = 2000: 1x0.142323s
1 NxN convolution, with N = 5000: 1x0.751217s
1 NxN convolution, with N = 10000: 1x3.015973s
1 NxN convolution, with N = 20000: 1x11.972197s
**
Too slow !
Backend : PyTorch (Gpu), Device : cuda -------------
100 NxN convolution, with N = 100: 100x0.000220s
100 NxN convolution, with N = 200: 100x0.000221s
100 NxN convolution, with N = 500: 100x0.000196s
100 NxN convolution, with N = 1000: 100x0.000163s
100 NxN convolution, with N = 2000: 100x0.000406s
100 NxN convolution, with N = 5000: 100x0.002281s
10 NxN convolution, with N = 10000: 10x0.009821s
10 NxN convolution, with N = 20000: 10x0.039236s
**
Memory overflow !
Backend : KeOps (Gpu), Device : cuda -------------
100 NxN convolution, with N = 100: 100x0.000134s
100 NxN convolution, with N = 200: 100x0.000135s
100 NxN convolution, with N = 500: 100x0.000143s
100 NxN convolution, with N = 1000: 100x0.000158s
100 NxN convolution, with N = 2000: 100x0.000183s
100 NxN convolution, with N = 5000: 100x0.000261s
100 NxN convolution, with N = 10000: 100x0.000384s
100 NxN convolution, with N = 20000: 100x0.000894s
100 NxN convolution, with N = 50000: 100x0.003867s
10 NxN convolution, with N = 100000: 10x0.014448s
10 NxN convolution, with N = 200000: 10x0.056979s
1 NxN convolution, with N = 500000: 1x0.343881s
1 NxN convolution, with N =1000000: 1x1.360110s
```

## NumPy vs. PyTorch vs. KeOps (Cpu)¶

```
use_cuda = False
routines = [ (gaussianconv_numpy, "Numpy (Cpu)", "numpy"),
(gaussianconv_pytorch, "PyTorch (Cpu)", "torch"),
(gaussianconv_keops, "KeOps (Cpu)", "torch"), ]
full_bench( "Gaussian Matrix-Vector products", routines )
```

Out:

```
Benchmarking : Gaussian Matrix-Vector products ===============================
Backend : Numpy (Cpu), Device : cpu -------------
100 NxN convolution, with N = 100: 100x0.000455s
100 NxN convolution, with N = 200: 100x0.001780s
100 NxN convolution, with N = 500: 100x0.010031s
10 NxN convolution, with N = 1000: 10x0.038903s
1 NxN convolution, with N = 2000: 1x0.145803s
1 NxN convolution, with N = 5000: 1x0.759431s
1 NxN convolution, with N = 10000: 1x2.973640s
1 NxN convolution, with N = 20000: 1x11.853082s
**
Too slow !
Backend : PyTorch (Cpu), Device : cpu -------------
100 NxN convolution, with N = 100: 100x0.000168s
100 NxN convolution, with N = 200: 100x0.001058s
100 NxN convolution, with N = 500: 100x0.026957s
10 NxN convolution, with N = 1000: 10x0.058643s
1 NxN convolution, with N = 2000: 1x0.005653s
1 NxN convolution, with N = 5000: 1x0.103791s
1 NxN convolution, with N = 10000: 1x0.369219s
1 NxN convolution, with N = 20000: 1x1.428481s
1 NxN convolution, with N = 50000: 1x7.557542s
1 NxN convolution, with N = 100000: 1x32.228860s
**
Too slow !
Backend : KeOps (Cpu), Device : cpu -------------
100 NxN convolution, with N = 100: 100x0.000127s
100 NxN convolution, with N = 200: 100x0.000138s
100 NxN convolution, with N = 500: 100x0.000182s
100 NxN convolution, with N = 1000: 100x0.000388s
100 NxN convolution, with N = 2000: 100x0.001334s
100 NxN convolution, with N = 5000: 100x0.007250s
10 NxN convolution, with N = 10000: 10x0.027697s
1 NxN convolution, with N = 20000: 1x0.123920s
1 NxN convolution, with N = 50000: 1x0.729527s
1 NxN convolution, with N = 100000: 1x2.395174s
1 NxN convolution, with N = 200000: 1x7.276235s
1 NxN convolution, with N = 500000: 1x44.383240s
**
Too slow !
```

## Genred vs. LazyTensor vs. batched LazyTensor¶

```
use_cuda = torch.cuda.is_available()
routines = [(gaussianconv_keops, "KeOps (Genred)", "torch"),
(gaussianconv_lazytensor, "KeOps (LazyTensor)", "torch"),
((gaussianconv_lazytensor, 10), "KeOps (LazyTensor, batchsize=10)", "torch"), ]
full_bench( "Gaussian Matrix-Vector products", routines )
plt.show()
```

Out:

```
Benchmarking : Gaussian Matrix-Vector products ===============================
Backend : KeOps (Genred), Device : cuda -------------
100 NxN convolution, with N = 100: 100x0.000137s
100 NxN convolution, with N = 200: 100x0.000142s
100 NxN convolution, with N = 500: 100x0.000156s
100 NxN convolution, with N = 1000: 100x0.000172s
100 NxN convolution, with N = 2000: 100x0.000214s
100 NxN convolution, with N = 5000: 100x0.000324s
100 NxN convolution, with N = 10000: 100x0.000502s
100 NxN convolution, with N = 20000: 100x0.001087s
100 NxN convolution, with N = 50000: 100x0.003794s
10 NxN convolution, with N = 100000: 10x0.014195s
10 NxN convolution, with N = 200000: 10x0.055831s
1 NxN convolution, with N = 500000: 1x0.336310s
1 NxN convolution, with N =1000000: 1x1.335842s
Backend : KeOps (LazyTensor), Device : cuda -------------
100 NxN convolution, with N = 100: 100x0.000295s
100 NxN convolution, with N = 200: 100x0.000295s
100 NxN convolution, with N = 500: 100x0.000309s
100 NxN convolution, with N = 1000: 100x0.000317s
100 NxN convolution, with N = 2000: 100x0.000343s
100 NxN convolution, with N = 5000: 100x0.000419s
100 NxN convolution, with N = 10000: 100x0.000544s
100 NxN convolution, with N = 20000: 100x0.001045s
100 NxN convolution, with N = 50000: 100x0.003949s
10 NxN convolution, with N = 100000: 10x0.014410s
10 NxN convolution, with N = 200000: 10x0.056368s
1 NxN convolution, with N = 500000: 1x0.339090s
1 NxN convolution, with N =1000000: 1x1.340441s
Backend : KeOps (LazyTensor, batchsize=10), Device : cuda -------------
10x100 NxN convolution, with N = 100: 10x100x0.000071s
10x100 NxN convolution, with N = 200: 10x100x0.000070s
10x100 NxN convolution, with N = 500: 10x100x0.000071s
10x100 NxN convolution, with N = 1000: 10x100x0.000073s
10x100 NxN convolution, with N = 2000: 10x100x0.000078s
10x100 NxN convolution, with N = 5000: 10x100x0.000105s
10x100 NxN convolution, with N = 10000: 10x100x0.000204s
10x100 NxN convolution, with N = 20000: 10x100x0.000626s
10x100 NxN convolution, with N = 50000: 10x100x0.003423s
10x 10 NxN convolution, with N = 100000: 10x 10x0.013353s
10x 10 NxN convolution, with N = 200000: 10x 10x0.053345s
10x 1 NxN convolution, with N = 500000: 10x 1x0.335074s
10x 1 NxN convolution, with N =1000000: 10x 1x1.355529s
/home/bcharlier/keops/pykeops/benchmarks/plot_benchmarks_convolutions_3D.py:275: UserWarning: Matplotlib is currently using agg, which is a non-GUI backend, so cannot show the figure.
plt.show()
```

**Total running time of the script:** ( 5 minutes 31.032 seconds)