# Datasets for the benchmarks

import numpy as np

# Synthetic benchmark datasets, sampled on the fly by get_dataset().
# Each entry maps a display key to (metric, Ntrain, Ntest, D):
#   metric  -- distance used for the K-NN query,
#   Ntrain  -- number of reference points,
#   Ntest   -- number of query points,
#   D       -- ambient dimension.
synthetic = {
    # Key   :  metric,    Ntrain, Ntest, D,
    "R^D a": ("euclidean", 10 ** 4, 10 ** 4, 3),
    "R^D b": ("euclidean", 10 ** 6, 10 ** 4, 3),
    "R^D c": ("euclidean", 10 ** 6, 10 ** 4, 10),
    "R^D d": ("euclidean", 10 ** 6, 10 ** 4, 100),
    "R^D e": ("euclidean", 10 ** 7, 10 ** 4, 100),
    "R^D f": ("manhattan", 10 ** 6, 10 ** 4, 10),
    "R^D g": ("manhattan", 10 ** 6, 10 ** 4, 100),
    "S^{D-1}": ("angular", 10 ** 6, 10 ** 4, 10),  # points on the unit sphere
    "H^D": ("hyperbolic", 10 ** 6, 10 ** 4, 10),  # points in a half-space model
}

# Real-world benchmark datasets, loaded from HDF5 files by get_dataset().
# Each entry maps a display key to (metric, filename, do_check):
#   metric   -- distance used for the K-NN query,
#   filename -- ANN-benchmarks HDF5 file expected in `data_folder`,
#   do_check -- whether the file's precomputed neighbors match this metric
#               (MNIST b reuses the euclidean file with a manhattan metric).
downloaded = {
    # Key     :    metric, filename, do_check
    "MNIST a": ("euclidean", "mnist-784-euclidean.hdf5", True),
    "MNIST b": ("manhattan", "mnist-784-euclidean.hdf5", False),
    "GloVe25": ("angular", "glove-25-angular.hdf5", True),
    "GloVe100": ("angular", "glove-100-angular.hdf5", True),
}


def get_dataset(key):
    """Load or sample the benchmark dataset associated with `key`.

    Parameters
    ----------
    key : str
        Either a key of `synthetic` (data is drawn at random) or a key of
        `downloaded` (data is read from an ANN-benchmarks HDF5 file).

    Returns
    -------
    dict
        {"train": (Ntrain, D) float32 array,
         "test":  (Ntest, D) float32 array,
         "metric": str,
         "output": precomputed neighbor indices, or None for synthetic data}
    """
    data_folder = ""  # '../data/'

    true_indices = None
    true_values = None

    if key in synthetic:
        metric, Ntrain, Ntest, D = synthetic[key]

        if metric == "hyperbolic":
            # Coordinates in (0.5, 1.5): keeps the first ("height")
            # coordinate strictly positive, as required by the
            # half-space distance used in ground_truth().
            x_train = 0.5 + np.random.rand(Ntrain, D)
            x_test = 0.5 + np.random.rand(Ntest, D)
        else:
            x_train = np.random.randn(Ntrain, D)
            x_test = np.random.randn(Ntest, D)

    else:
        metric, filename, do_check = downloaded[key]

        import h5py

        # Context manager guarantees that the HDF5 handle is closed,
        # even if one of the reads below raises.
        with h5py.File(data_folder + filename, "r") as f:
            x_train = f["train"][()]
            x_test = f["test"][()]
            true_indices = f["neighbors"][()]
            true_values = f["distances"][()]

    #  With the angular metric, all the points are normalized:
    if metric == "angular":
        # Out-of-place division: in-place "/=" would fail on
        # integer-typed arrays loaded from an HDF5 file.
        x_train = x_train / np.linalg.norm(x_train, axis=1, keepdims=True)
        x_test = x_test / np.linalg.norm(x_test, axis=1, keepdims=True)

    x_train = x_train.astype("float32")
    x_test = x_test.astype("float32")

    return {
        "train": x_train,
        "test": x_test,
        "metric": metric,
        "output": true_indices,
        # "true_distances": true_values,
    }


from benchmark_utils import tensor
from pykeops.torch import LazyTensor


def ground_truth(x_train, x_test, K, metric):
    """Compute exact K-NN indices with a brute-force KeOps reduction.

    Parameters
    ----------
    x_train : (Ntrain, D) array
        Reference points.
    x_test : (Ntest, D) array
        Query points.
    K : int
        Number of neighbors to return per query.
    metric : str
        One of "euclidean", "manhattan", "angular", "hyperbolic".

    Returns
    -------
    (Ntest, K) int array
        Indices of the K nearest training points for each test point.

    Raises
    ------
    ValueError
        If `metric` is not one of the supported strings.
    """
    # Setup the K-NN estimator:
    x_train = tensor(x_train)
    x_test = tensor(x_test)

    # Encoding as KeOps LazyTensors:
    X_i = LazyTensor(x_test[:, None, :])  # queries, broadcast over axis 1
    X_j = LazyTensor(x_train[None, :, :])  # references, broadcast over axis 0

    # Symbolic distance matrix:
    if metric == "euclidean":
        D_ij = ((X_i - X_j) ** 2).sum(-1)
    elif metric == "manhattan":
        D_ij = (X_i - X_j).abs().sum(-1)
    elif metric == "angular":
        # Inputs are unit-normalized, so ranking by -<x,y> matches
        # ranking by angular distance.
        D_ij = -(X_i | X_j)
    elif metric == "hyperbolic":
        D_ij = ((X_i - X_j) ** 2).sum(-1) / (X_i[0] * X_j[0])
    else:
        # Fail fast: without this branch, an unknown metric would leave
        # D_ij unbound and raise a confusing NameError below.
        raise ValueError(f"Unsupported metric: {metric!r}")

    # K-NN query:
    indices = D_ij.argKmin(K, dim=1)
    return indices.cpu().numpy()


from copy import deepcopy


def generate_samples(key):
    """Build a sampler of K-NN problems for the dataset named `key`.

    The dataset is loaded once; the returned callable takes the number
    of neighbors K and yields an independent copy of the dataset with
    the exact K-NN indices attached under the "output" key.
    """
    base_dataset = get_dataset(key)

    def samples(K):
        # Deep-copy so that repeated calls with different K never
        # mutate each other's "output" field.
        problem = deepcopy(base_dataset)
        problem["output"] = ground_truth(
            problem["train"], problem["test"], K, problem["metric"]
        )
        return problem

    return samples

# Total running time of the script: ( 0 minutes 0.000 seconds)
#
# Gallery generated by Sphinx-Gallery