# Solving positive definite linear systems¶

This benchmark compares the performances of KeOps versus Numpy and Pytorch on a inverse matrix operation. It uses the functions torch.KernelSolve (see also here) and numpy.KernelSolve (see also here).

In a nutshell, given $$x \in\mathbb R^{N\times D}$$ and $$b \in \mathbb R^{N\times D_v}$$, we compute $$a \in \mathbb R^{N\times D_v}$$ so that

$b = (\alpha\operatorname{Id} + K_{x,x}) a \quad \Leftrightarrow \quad a = (\alpha\operatorname{Id}+ K_{x,x})^{-1} b$

where $$K_{x,x} = \Big[\exp(-\|x_i -x_j\|^2 / \sigma^2)\Big]_{i,j=1}^N$$. The method is based on a conjugate gradient scheme. The benchmark tests various values of $$N \in [10, \cdots,10^6]$$.

## Setup¶

Standard imports:

import importlib
import os
import time

import numpy as np
import torch
from matplotlib import pyplot as plt

from scipy.sparse import diags
from scipy.sparse.linalg import aslinearoperator, cg
from scipy.sparse.linalg.interface import IdentityOperator

from pykeops.numpy import KernelSolve as KernelSolve_np, LazyTensor
from pykeops.torch import KernelSolve
from pykeops.torch.utils import squared_distances

use_cuda = torch.cuda.is_available()


Benchmark specifications:

D  = 3  # Let's do this in 3D
Dv = 1  # Dimension of the vectors (= number of linear problems to solve)
MAXTIME = 10 if use_cuda else 1   # Max number of seconds before we break the loop
REDTIME = 5  if use_cuda else .2  # Decrease the number of runs if computations take longer than 2s...

# Number of samples that we'll loop upon
NS = [10, 20, 50,
100, 200, 500,
1000, 2000, 5000,
10000, 20000, 50000,
100000, 200000, 500000,
1000000
]


Create some random input data:

def generate_samples(N, device, lang):
"""Create point clouds sampled non-uniformly on a sphere of diameter 1."""
if lang == 'torch':
if device == 'cuda':
torch.cuda.manual_seed_all(1234)
else:
torch.manual_seed(1234)

x = torch.rand(N, D, device=device)
b = torch.randn(N, Dv, device=device)
gamma = torch.ones(1, device=device) * .5 / .01 ** 2  # kernel bandwidth
alpha = torch.ones(1, device=device) * 0.8  # regularization
else:
np.random.seed(1234)

x  = np.random.rand(N, D).astype('float32')
b  = np.random.randn(N, Dv).astype('float32')
gamma = (np.ones(1) * 1 / .01 ** 2).astype('float32')   # kernel bandwidth
alpha = (np.ones(1) * 0.8).astype('float32')  # regularization

return x, b, gamma, alpha


## KeOps kernel¶

Define a Gaussian RBF kernel:

formula = 'Exp(- g * SqDist(x,y)) * a'
aliases = ['x = Vi(' + str(D) + ')',   # First arg:  i-variable of size D
'y = Vj(' + str(D) + ')',   # Second arg: j-variable of size D
'a = Vj(' + str(Dv) + ')',  # Third arg:  j-variable of size Dv
'g = Pm(1)']                # Fourth arg: scalar parameter


Note

This operator uses a conjugate gradient solver and assumes that formula defines a symmetric, positive and definite linear reduction with respect to the alias "a" specified trough the third argument.

Define the Kernel solver, with a ridge regularization alpha:

def Kinv_keops(x, b, gamma, alpha):
Kinv = KernelSolve(formula, aliases, "a", axis=1)
res = Kinv(x, x, b, gamma, alpha=alpha)
return res

def Kinv_keops_numpy(x, b, gamma, alpha):
Kinv = KernelSolve_np(formula, aliases, "a", axis=1, dtype='float32')
res = Kinv(x, x, b, gamma, alpha=alpha)
return res

def Kinv_scipy(x, b, gamma, alpha):
x_i, y_j = LazyTensor( gamma * x[:, None, :]), LazyTensor( gamma * x[None, :, :])
K_ij = (- ((x_i - y_j) ** 2).sum(2)).exp()
A = aslinearoperator(diags(alpha * np.ones(x.shape))) +  aslinearoperator(K_ij)
A.dtype = np.dtype('float32')
res = cg(A, b)
return res


Define the same Kernel solver, using a tensorized implementation:

def Kinv_pytorch(x, b, gamma, alpha):
K_xx = alpha * torch.eye(x.shape, device=x.get_device()) + torch.exp( - squared_distances(x, x) * gamma)
res = torch.solve(b, K_xx)
return res

def Kinv_numpy(x, b, gamma, alpha):
K_xx = alpha * np.eye(x.shape) + np.exp( - gamma * np.sum( (x[:,None,:] - x[None,:,:]) **2, axis=2) )
res = np.linalg.solve(K_xx, b)
return res


## Benchmarking loops¶

def benchmark(Routine, dev, N, loops=10, lang='torch') :
"""Times a routine on an N-by-N problem."""

device = torch.device(dev)
x, b, gamma, alpha = generate_samples(N, device, lang)

# We simply benchmark a kernel inversion
code = "a = Routine(x, b, gamma, alpha)"
exec( code, locals() ) # Warmup run, to compile and load everything
if use_cuda: torch.cuda.synchronize()

t_0 = time.perf_counter()  # Actual benchmark --------------------
for i in range(loops):
exec( code, locals() )
if use_cuda: torch.cuda.synchronize()
elapsed = time.perf_counter() - t_0  # ---------------------------

print("{:3} NxN kernel inversion, with N ={:7}: {:3}x{:3.6f}s".format(loops, N, loops, elapsed / loops))
return elapsed / loops

def bench_config(Routine, backend, dev, l) :
"""Times a routine for an increasing number of samples."""

print("Backend : {}, Device : {} -------------".format(backend, dev))

times = []
not_recorded_times = []
try :
Nloops = [100, 10, 1]
nloops = Nloops.pop(0)
for n in NS :
elapsed = benchmark(Routine, dev, n, loops=nloops, lang=l)

times.append( elapsed )
if (nloops * elapsed > MAXTIME) or (nloops * elapsed > REDTIME/nloops and len(Nloops) > 0):
nloops = Nloops.pop(0)

except RuntimeError:
print("**\nMemory overflow !")
not_recorded_times = (len(NS)-len(times)) * [np.nan]
except IndexError:
print("**\nToo slow !")
not_recorded_times = (len(NS)-len(times)) * [np.Infinity]

return times + not_recorded_times

def full_bench(title, routines) :
"""Benchmarks a collection of routines."""

backends = [ backend for (_, backend, _) in routines ]

print("Benchmarking : {} ===============================".format(title))

lines  = [ NS ]
for routine, backend, lang in routines :
lines.append(bench_config(routine, backend, "cuda" if use_cuda else "cpu", lang) )

benches = np.array(lines).T

# Creates a pyplot figure:
plt.figure(figsize=(12,8))
linestyles = ["o-", "s-", "^-", "x-", "<-"]
for i, backend in enumerate(backends):
plt.plot( benches[:,0], benches[:,i+1], linestyles[i],
linewidth=2, label='backend = "{}"'.format(backend) )

for (j, val) in enumerate( benches[:,i+1] ):
if np.isnan(val) and j > 0:
x, y = benches[j-1,0], benches[j-1,i+1]
plt.annotate('Memory overflow!',
xy=(x, 1.05*y),
horizontalalignment='center',
verticalalignment='bottom')
break
elif np.isinf(val) and j > 0:
x, y = benches[j-1,0], benches[j-1,i+1]
plt.annotate('Too slow!',
xy=(x, 1.05*y),
horizontalalignment='center',
verticalalignment='bottom')
break

plt.title('Runtimes for {} in dimension {}'.format(title, D))
plt.xlabel('Number of samples')
plt.ylabel('Seconds')
plt.yscale('log') ; plt.xscale('log')
plt.legend(loc='upper left')
plt.grid(True, which="major", linestyle="-")
plt.grid(True, which="minor", linestyle="dotted")
plt.tight_layout()

# Save as a .csv to put a nice Tikz figure in the papers:
header = "Npoints " + " ".join(backends)
os.makedirs("output", exist_ok=True)
np.savetxt("output/benchmark_kernelsolve.csv", benches,


## Run the benchmark¶

routines = [(Kinv_numpy, "NumPy", "numpy"),
(Kinv_pytorch, "PyTorch", "torch"),
(Kinv_keops_numpy, "NumPy + KeOps", "numpy"),
(Kinv_keops,   "PyTorch + KeOps", "torch"),
(Kinv_scipy,   "Scipy + KeOps", "numpy"),
]
full_bench( "Inverse radial kernel matrix", routines )

plt.show() Out:

Benchmarking : Inverse radial kernel matrix ===============================
Backend : NumPy, Device : cuda -------------
100 NxN kernel inversion, with N =     10: 100x0.000050s
100 NxN kernel inversion, with N =     20: 100x0.000071s
100 NxN kernel inversion, with N =     50: 100x0.000179s
100 NxN kernel inversion, with N =    100: 100x0.001958s
10 NxN kernel inversion, with N =    200:  10x0.010744s
10 NxN kernel inversion, with N =    500:  10x0.016682s
10 NxN kernel inversion, with N =   1000:  10x0.052924s
1 NxN kernel inversion, with N =   2000:   1x0.370490s
1 NxN kernel inversion, with N =   5000:   1x2.982131s
1 NxN kernel inversion, with N =  10000:   1x16.571436s
**
Too slow !
Backend : PyTorch, Device : cuda -------------
100 NxN kernel inversion, with N =     10: 100x0.002620s
10 NxN kernel inversion, with N =     20:  10x0.002559s
10 NxN kernel inversion, with N =     50:  10x0.002584s
10 NxN kernel inversion, with N =    100:  10x0.002650s
10 NxN kernel inversion, with N =    200:  10x0.005435s
10 NxN kernel inversion, with N =    500:  10x0.007049s
10 NxN kernel inversion, with N =   1000:  10x0.008876s
10 NxN kernel inversion, with N =   2000:  10x0.017951s
10 NxN kernel inversion, with N =   5000:  10x0.076712s
1 NxN kernel inversion, with N =  10000:   1x0.287958s
1 NxN kernel inversion, with N =  20000:   1x2.081410s
**
Memory overflow !
Backend : NumPy + KeOps, Device : cuda -------------
100 NxN kernel inversion, with N =     10: 100x0.000180s
100 NxN kernel inversion, with N =     20: 100x0.000178s
100 NxN kernel inversion, with N =     50: 100x0.000177s
100 NxN kernel inversion, with N =    100: 100x0.000291s
100 NxN kernel inversion, with N =    200: 100x0.000372s
100 NxN kernel inversion, with N =    500: 100x0.000728s
10 NxN kernel inversion, with N =   1000:  10x0.000864s
10 NxN kernel inversion, with N =   2000:  10x0.001285s
10 NxN kernel inversion, with N =   5000:  10x0.003022s
10 NxN kernel inversion, with N =  10000:  10x0.005638s
10 NxN kernel inversion, with N =  20000:  10x0.017064s
10 NxN kernel inversion, with N =  50000:  10x0.078482s
1 NxN kernel inversion, with N = 100000:   1x0.303169s
1 NxN kernel inversion, with N = 200000:   1x1.393994s
1 NxN kernel inversion, with N = 500000:   1x10.069412s
**
Too slow !
Backend : PyTorch + KeOps, Device : cuda -------------
100 NxN kernel inversion, with N =     10: 100x0.000571s
10 NxN kernel inversion, with N =     20:  10x0.000571s
10 NxN kernel inversion, with N =     50:  10x0.001168s
10 NxN kernel inversion, with N =    100:  10x0.001173s
10 NxN kernel inversion, with N =    200:  10x0.001795s
10 NxN kernel inversion, with N =    500:  10x0.002172s
10 NxN kernel inversion, with N =   1000:  10x0.002261s
10 NxN kernel inversion, with N =   2000:  10x0.003025s
10 NxN kernel inversion, with N =   5000:  10x0.004729s
10 NxN kernel inversion, with N =  10000:  10x0.007752s
10 NxN kernel inversion, with N =  20000:  10x0.019320s
10 NxN kernel inversion, with N =  50000:  10x0.091227s
1 NxN kernel inversion, with N = 100000:   1x0.369627s
1 NxN kernel inversion, with N = 200000:   1x1.970583s
1 NxN kernel inversion, with N = 500000:   1x61.601444s
**
Too slow !
Backend : Scipy + KeOps, Device : cuda -------------
100 NxN kernel inversion, with N =     10: 100x0.000888s
10 NxN kernel inversion, with N =     20:  10x0.000874s
10 NxN kernel inversion, with N =     50:  10x0.000886s
10 NxN kernel inversion, with N =    100:  10x0.000901s
10 NxN kernel inversion, with N =    200:  10x0.000969s
10 NxN kernel inversion, with N =    500:  10x0.001016s
10 NxN kernel inversion, with N =   1000:  10x0.001100s
10 NxN kernel inversion, with N =   2000:  10x0.001218s
10 NxN kernel inversion, with N =   5000:  10x0.001597s
10 NxN kernel inversion, with N =  10000:  10x0.002188s
10 NxN kernel inversion, with N =  20000:  10x0.004141s
10 NxN kernel inversion, with N =  50000:  10x0.018875s
10 NxN kernel inversion, with N = 100000:  10x0.090040s
1 NxN kernel inversion, with N = 200000:   1x0.725360s
1 NxN kernel inversion, with N = 500000:   1x12.002041s
**
Too slow !
/home/bcharlier/keops/pykeops/benchmarks/plot_benchmark_invkernel.py:258: UserWarning: Matplotlib is currently using agg, which is a non-GUI backend, so cannot show the figure.
plt.show()


Total running time of the script: ( 3 minutes 22.195 seconds)

Gallery generated by Sphinx-Gallery