-
Notifications
You must be signed in to change notification settings - Fork 4
Documentation
Leonardo Xavier Kuffo Rivero edited this page Mar 30, 2026
·
3 revisions
Below, we provide a minimal example to run Super K-Means clustering in C++.
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <vector>
#include "superkmeans/pdx/utils.h"
#include "superkmeans/superkmeans.h"
// Minimal end-to-end example: generate synthetic data, train SuperKMeans on it,
// then compute the cluster assignment of every training point.
// `argc`/`argv` are accepted but not used by this example.
int main(int argc, char* argv[]) {
    size_t n = 1000000; // Number of embeddings
    size_t d = 768;     // Dimensionality of the embeddings
    size_t k = 1000;    // Number of clusters to create

    std::cout << "Generating " << n << " vectors with d=" << d << std::endl;
    // Synthetic input stored in one flat float buffer.
    // NOTE(review): the meaning of the `100` and `true` arguments is defined by
    // skmeans::MakeBlobs, which is not shown here; confirm against its declaration.
    std::vector<float> data = skmeans::MakeBlobs(n, d, 100, true);

    auto kmeans = skmeans::SuperKMeans(k, d);

    std::cout << "Running SuperKMeans with " << k << " clusters..." << std::endl;
    // Train receives the raw data pointer and the number of points; the returned
    // float vector holds the centroids.
    std::vector<float> centroids = kmeans.Train(data.data(), n);

    // Get assignments
    std::cout << "Getting final assignments..." << std::endl;
    // One uint32_t result per call is collected into `assignments`; the call is
    // given the data, the trained centroids, and the counts n and k.
    std::vector<uint32_t> assignments = kmeans.AssignTrainingPoints(data.data(), centroids.data(), n, k);
}
This variant is extremely fast. We recommend it if you have more than 100K data points.
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <vector>
#include "superkmeans/pdx/utils.h"
#include "superkmeans/hierarchical_superkmeans.h"
// Minimal end-to-end example for the hierarchical variant: generate synthetic
// data, train HierarchicalSuperKMeans on it, then compute the cluster
// assignment of every training point.
// `argc`/`argv` are accepted but not used by this example.
int main(int argc, char* argv[]) {
    size_t n = 1000000; // Number of embeddings
    size_t d = 768;     // Dimensionality of the embeddings
    size_t k = 1000;    // Number of clusters to create

    std::cout << "Generating " << n << " vectors with d=" << d << std::endl;
    // Synthetic input stored in one flat float buffer.
    // NOTE(review): the meaning of the `100` and `true` arguments is defined by
    // skmeans::MakeBlobs, which is not shown here; confirm against its declaration.
    std::vector<float> data = skmeans::MakeBlobs(n, d, 100, true);

    auto kmeans = skmeans::HierarchicalSuperKMeans(k, d);

    std::cout << "Running SuperKMeans with " << k << " clusters..." << std::endl;
    // Train receives the raw data pointer and the number of points; the returned
    // float vector holds the centroids.
    std::vector<float> centroids = kmeans.Train(data.data(), n);

    // Get assignments
    std::cout << "Getting final assignments..." << std::endl;
    // The call is given the data, the trained centroids, and the counts n and k.
    std::vector<uint32_t> assignments = kmeans.AssignTrainingPoints(data.data(), centroids.data(), n, k);
}
We provide a plethora of parameters for the customization of your clustering via a config object.
skmeans::SuperKMeansConfig config;
config.angular = false // Use spherical k-means (default=False)
config.iters = 10 // Number of iterations in the core loop (default=10)
config.sampling_fraction = 0.3 // Fraction of points to sample (default=0.3)
config.n_threads = 32 // Number of threads to use (default=0, uses max available)
config.seed = 42 // Random seed for reproducibility
config.early_termination = true // Whether to enable early termination mechanisms (default=True)
config.tol = 1e-4f // Tolerance for WCSS improvement rate and centroids shift before stopping (default=1e-4)
config.verbose = false # Prints debug logs
config.data_already_rotated = false // If True, assumes input data is already rotated (default=False)
config.use_blas_only = false // If True, disables SuperKMeans PRUNING phase (default=False)
// The constructor accepts the `config` as a third parameter
auto kmeans = skmeans::SuperKMeans(k, d, config);skmeans::HierarchicalSuperKMeansConfig config;
// Fragment: `config` is the skmeans::HierarchicalSuperKMeansConfig declared on
// the preceding line; `k` and `d` are assumed to be defined as in the examples.
// It accepts the same parameters as the vanilla k-means configuration, plus:
config.iters_mesoclustering = 3; // Iterations for the mesoclustering phase
config.iters_fineclustering = 5; // Iterations for the fineclustering phase
// Additional vanilla k-means iterations that refine the clusters produced by
// hierarchical k-means. In practice, these are very expensive and do not lead
// to substantial gains (default = 0).
config.iters_refinement = 1;
// The constructor accepts the `config` as a third parameter
auto kmeans = skmeans::HierarchicalSuperKMeans(k, d, config);