Gradient of radial kernel convolutions

This benchmark compares the performance of KeOps against NumPy and PyTorch on the gradients of various radial kernel convolutions. Namely, it computes:

\[c_i^u = \sum_{j=1}^N \partial_{x_i^u} f\Big(\frac{\|x_i-y_j\|}{\sigma}\Big) \langle b_j, a_i \rangle, \quad \text{ for all } i=1,\cdots,M, \, u=1,\cdots,D\]

where \(f\) is a Gaussian, Laplacian, Cauchy or inverse multiquadric kernel. See e.g. Wikipedia.
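
For the Gaussian kernel \(f(t) = e^{-t^2}\), for instance, this quantity can be written out directly in NumPy. The sketch below is purely illustrative (the function name is ours); the benchmark itself relies on the grad_np_kernel and chain_rules helpers from pykeops:

import numpy as np

def gaussian_grad_conv(a, x, y, b, sigma):
    # Differences x_i - y_j, shape (M, N, D)
    diff = x[:, None, :] - y[None, :, :]
    # Squared distances |x_i - y_j|^2, shape (M, N)
    sqdist = (diff ** 2).sum(-1)
    # exp(-|x_i - y_j|^2 / sigma^2) times <b_j, a_i>, shape (M, N)
    coef = np.exp(-sqdist / sigma ** 2) * (a @ b.T)
    # Differentiating w.r.t. x_i^u brings down a -2 (x_i^u - y_j^u) / sigma^2 factor
    return (-2.0 / sigma ** 2) * (coef[:, :, None] * diff).sum(axis=1)  # (M, D)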

Setup

Standard imports:

import numpy as np
import timeit
import matplotlib
from matplotlib import pyplot as plt
from pykeops.numpy.utils import grad_np_kernel, chain_rules

Benchmark specifications:

M = 10000  # Number of points x_i
N = 10000  # Number of points y_j
D = 3      # Dimension of the x_i's and y_j's
E = 3      # Dimension of the b_j's
REPEAT = 10  # Number of loops per test

use_numpy = True    # Run the (slow) NumPy baseline
use_vanilla = True  # Run the vanilla PyTorch baseline

dtype = 'float32'

Create some random input data:

a = np.random.rand(M, E).astype(dtype)  # Gradient to backprop
x = np.random.rand(M, D).astype(dtype)  # Target points
y = np.random.rand(N, D).astype(dtype)  # Source points
b = np.random.rand(N, E).astype(dtype)  # Source signals
sigma = np.array([0.4]).astype(dtype)   # Kernel radius

And load them as PyTorch tensors:

try:
    import torch

    use_cuda = torch.cuda.is_available()
    device = 'cuda' if use_cuda else 'cpu'
    torchtype = torch.float32 if dtype == 'float32' else torch.float64

    ac = torch.tensor(a, dtype=torchtype, device=device)
    xc = torch.tensor(x, dtype=torchtype, device=device, requires_grad=True)
    yc = torch.tensor(y, dtype=torchtype, device=device)
    bc = torch.tensor(b, dtype=torchtype, device=device)
    sigmac = torch.tensor(sigma, dtype=torchtype, device=device)

except ImportError:
    # PyTorch is not available: the Torch-based benchmarks below will be skipped.
    pass

Convolution Gradient Benchmarks

We loop over four different kernels:

kernel_to_test = ['gaussian', 'laplacian', 'cauchy', 'inverse_multiquadric']

With four backends: NumPy, vanilla PyTorch, generic KeOps reductions, and a specific, hand-written legacy CUDA routine for kernel convolution gradients:

speed_numpy = {i: np.nan for i in kernel_to_test}
speed_pykeops = {i: np.nan for i in kernel_to_test}
speed_pytorch = {i: np.nan for i in kernel_to_test}
speed_pykeops_specific = {i: np.nan for i in kernel_to_test}

print('Timings for {}x{} convolution gradients:'.format(M, N))

for k in kernel_to_test:
    print('kernel: ' + k)

    # Pure numpy
    if use_numpy:
        gnumpy = chain_rules(a, x, y, grad_np_kernel(x, y, sigma, kernel=k), b)
        speed_numpy[k] = timeit.repeat('gnumpy = chain_rules(a, x, y, grad_np_kernel(x, y, sigma, kernel=k), b)',
            globals=globals(), repeat=3, number=1)
        print('Time for NumPy:               {:.4f}s'.format(np.median(speed_numpy[k])))
    else:
        gnumpy = torch.zeros_like(xc).data.cpu().numpy()


    # Vanilla PyTorch (with CUDA if available, and CPU otherwise)
    if use_vanilla:
        try:
            from pykeops.torch import Kernel, kernel_product

            params = {
                'id': Kernel(k + '(x,y)'),
                'gamma': 1. / (sigmac**2),
                'backend': 'pytorch',
            }

            aKxy_b = torch.dot(ac.view(-1), kernel_product(params, xc, yc, bc, mode='sum').view(-1))
            g3 = torch.autograd.grad(aKxy_b, xc, create_graph=False)[0].cpu()
            torch.cuda.synchronize()
            speed_pytorch[k] = np.array(timeit.repeat(
                setup = "cost = torch.dot(ac.view(-1), kernel_product(params, xc, yc, bc, mode='sum').view(-1))",
                stmt  = "g3 = torch.autograd.grad(cost, xc, create_graph=False)[0] ; torch.cuda.synchronize()",
                globals=globals(), repeat=REPEAT, number=1))
            print('Time for PyTorch:             {:.4f}s'.format(np.median(speed_pytorch[k])), end='')
            print('   (absolute error:       ', np.max(np.abs(g3.data.numpy() - gnumpy)), ')')
        except:
            print('Time for PyTorch:             Not Done')



    # KeOps: generic tiled implementation (with CUDA if available, and CPU otherwise)
    try:
        from pykeops.torch import Kernel, kernel_product

        params = {
            'id': Kernel(k + '(x,y)'),
            'gamma': 1. / (sigmac**2),
            'backend': 'auto',
        }

        aKxy_b = torch.dot(ac.view(-1), kernel_product(params, xc, yc, bc, mode='sum').view(-1))
        g3 = torch.autograd.grad(aKxy_b, xc, create_graph=False)[0].cpu()
        torch.cuda.synchronize()
        speed_pykeops[k] = np.array(timeit.repeat(
            setup = "cost = torch.dot(ac.view(-1), kernel_product(params, xc, yc, bc, mode='sum').view(-1))",
            stmt  = "g3 = torch.autograd.grad(cost, xc, create_graph=False)[0] ; torch.cuda.synchronize()",
            globals=globals(), repeat=REPEAT, number=1))
        print('Time for KeOps generic:       {:.4f}s'.format(np.median(speed_pykeops[k])), end='')
        print('   (absolute error:       ', np.max(np.abs(g3.data.numpy() - gnumpy)), ')')
    except:
        print('Time for KeOps generic:       Not Done')


    # Specific CUDA tiled implementation (if CUDA is available)
    try:
        from pykeops.numpy import RadialKernelGrad1conv
        my_conv = RadialKernelGrad1conv(dtype)
        g1 = my_conv(a, x, y, b, sigma, kernel=k)
        torch.cuda.synchronize()
        speed_pykeops_specific[k] = np.array(timeit.repeat(
            'g1 = my_conv(a, x, y, b, sigma, kernel=k)',
            globals=globals(), repeat=REPEAT, number=1))
        print('Time for KeOps cuda specific: {:.4f}s'.format(np.median(speed_pykeops_specific[k])), end='')
        print('   (absolute error:       ', np.max(np.abs(g1 - gnumpy)), ')')
    except:
        print('Time for KeOps cuda specific: Not Done')

Out:

Timings for 10000x10000 convolution gradients:
kernel: gaussian
Time for NumPy:               6.1470s
Time for PyTorch:             0.0228s   (absolute error:        0.004638672 )
Time for KeOps generic:       0.0020s   (absolute error:        0.021484375 )
Compiling radial_kernel_grad1conv using float32... Done.
Time for KeOps cuda specific: 0.0009s   (absolute error:        0.021484375 )
kernel: laplacian
Time for NumPy:               5.4255s
Time for PyTorch:             0.0265s   (absolute error:        0.32751465 )
Compiling libKeOpstorche1b9334428 in /home/bcharlier/tmp/libkeops/pykeops/common/../build//build-libKeOpstorche1b9334428:
       formula: Grad_WithSavedForward(Sum_Reduction((Exp(-Sqrt( WeightedSqDist(G_0,X_0,Y_0))) * B_0),0), Var(1,3,0), Var(4,3,0), Var(5,3,0))
       aliases: G_0 = Pm(0,1); X_0 = Vi(1,3); Y_0 = Vj(2,3); B_0 = Vj(3,3); Var(4,3,0); Var(5,3,0);
       dtype  : float32
... Done.
Time for KeOps generic:       0.0026s   (absolute error:        0.32226562 )
Time for KeOps cuda specific: 0.0012s   (absolute error:        0.3223877 )
kernel: cauchy
Time for NumPy:               5.7923s
Time for PyTorch:             0.0265s   (absolute error:        0.0031738281 )
Compiling libKeOpstorch1052fd446b in /home/bcharlier/tmp/libkeops/pykeops/common/../build//build-libKeOpstorch1052fd446b:
       formula: Grad_WithSavedForward(Sum_Reduction((Inv( IntCst(1) + WeightedSqDist(G_0,X_0,Y_0)) * B_0),0), Var(1,3,0), Var(4,3,0), Var(5,3,0))
       aliases: G_0 = Pm(0,1); X_0 = Vi(1,3); Y_0 = Vj(2,3); B_0 = Vj(3,3); Var(4,3,0); Var(5,3,0);
       dtype  : float32
... Done.
Time for KeOps generic:       0.0023s   (absolute error:        0.022949219 )
Time for KeOps cuda specific: 0.0011s   (absolute error:        0.022949219 )
kernel: inverse_multiquadric
Time for NumPy:               6.8716s
Time for PyTorch:             0.0242s   (absolute error:        0.001953125 )
Compiling libKeOpstorch7ab9ba97cc in /home/bcharlier/tmp/libkeops/pykeops/common/../build//build-libKeOpstorch7ab9ba97cc:
       formula: Grad_WithSavedForward(Sum_Reduction((Inv(Sqrt(IntCst(1) + WeightedSqDist(G_0,X_0,Y_0))) * B_0),0), Var(1,3,0), Var(4,3,0), Var(5,3,0))
       aliases: G_0 = Pm(0,1); X_0 = Vi(1,3); Y_0 = Vj(2,3); B_0 = Vj(3,3); Var(4,3,0); Var(5,3,0);
       dtype  : float32
... Done.
Time for KeOps generic:       0.0024s   (absolute error:        0.020019531 )
Time for KeOps cuda specific: 0.0012s   (absolute error:        0.020019531 )
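
As a side note, in recent pykeops versions the same generic KeOps gradient can also be expressed through the LazyTensor interface. Here is a minimal sketch for the Gaussian kernel, reusing the tensors ac, xc, yc, bc and sigmac defined above; it is an illustration only and is not part of the timings reported here:

from pykeops.torch import LazyTensor

x_i = LazyTensor(xc[:, None, :])           # (M, 1, D) symbolic "i" variable
y_j = LazyTensor(yc[None, :, :])           # (1, N, D) symbolic "j" variable
D_ij = ((x_i - y_j) ** 2).sum(-1)          # (M, N) symbolic squared distances
K_ij = (-D_ij / float(sigmac) ** 2).exp()  # Gaussian kernel, never materialized in memory
cost = (ac * (K_ij @ bc)).sum()            # <a, K(x, y) b>, reduced by KeOps
grad_x = torch.autograd.grad(cost, xc)[0]  # gradient with respect to the x_i's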

Display results

# Plot the timing distributions as violin plots, one series per backend
plt.violinplot(list(speed_numpy.values()),
               showmeans=False,
               showmedians=True,
               )
plt.violinplot(list(speed_pytorch.values()),
               showmeans=False,
               showmedians=True,
               )
plt.violinplot(list(speed_pykeops.values()),
               showmeans=False,
               showmedians=True,
               )
plt.violinplot(list(speed_pykeops_specific.values()),
               showmeans=False,
               showmedians=True,
               )

plt.xticks([1, 2, 3, 4], kernel_to_test)
plt.yscale('log')
# plt.ylim((0, .01))

plt.grid(True)
plt.xlabel('kernel type')
plt.ylabel('time in s.')

cmap = plt.get_cmap("tab10")
fake_handles = [matplotlib.patches.Patch(color=cmap(i)) for i in range(4)]

plt.legend(fake_handles, ['NumPy', 'PyTorch', 'KeOps', 'KeOps specific'], loc='best')

plt.show()
[Figure: violin plots of the timings for each kernel and backend (sphx_glr_plot_benchmark_grad1convolutions_001.png)]

Out:

/home/bcharlier/tmp/libkeops/pykeops/benchmarks/plot_benchmark_grad1convolutions.py:210: UserWarning: Matplotlib is currently using agg, which is a non-GUI backend, so cannot show the figure.
  plt.show()

Total running time of the script: ( 2 minutes 50.818 seconds)
