“That which we need the most will be found where we least want to look.” - Carl Jung.

CUDA implementations of vector and matrix multiplication, as a baseline to illustrate CUTLASS

#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort with a diagnostic if a CUDA runtime call fails. Kernel launches do
// not return errors directly, so we also call cudaGetLastError() after the
// launch below.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,        \
                         __LINE__, cudaGetErrorString(err_));               \
            std::abort();                                                   \
        }                                                                   \
    } while (0)

// Inner (dot) product of two length-N vectors, accumulated into *c.
//
// Each thread computes one elementwise product, the warp reduces those 32
// partials with shuffle intrinsics, and only lane 0 of each warp issues an
// atomicAdd -- one atomic per 32 threads instead of one per thread, which
// avoids serializing ~N atomics on a single address.
//
// Grid/block: 1D launch; blockDim.x must be a multiple of 32. Out-of-range
// lanes contribute 0.0f so the full-warp shuffle mask is valid.
// Requires SM30+ for warp shuffles (and *_sync variants on Volta+).
__global__ void inner_product_kernel(const float *__restrict__ a,
                                     const float *__restrict__ b,
                                     float *__restrict__ c, int N) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    float v = (index < N) ? a[index] * b[index] : 0.0f;

    // Warp-level tree reduction; every lane of the warp participates.
    for (int offset = 16; offset > 0; offset >>= 1) {
        v += __shfl_down_sync(0xffffffffu, v, offset);
    }
    if ((threadIdx.x & 31) == 0) {
        atomicAdd(c, v);  // one atomic per warp holds the warp's partial sum
    }
}

int main() {
    const int N = 1000000;  // vector length

    // Host memory allocation. h_c is a scalar accumulator, not a pointer.
    float *h_a = (float *)malloc(N * sizeof(float));
    float *h_b = (float *)malloc(N * sizeof(float));
    float h_c = 0.0f;
    if (h_a == nullptr || h_b == nullptr) {
        std::fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Initialize vectors with uniform random values in [0, 1].
    for (int i = 0; i < N; i++) {
        h_a[i] = static_cast<float>(rand()) / RAND_MAX;
        h_b[i] = static_cast<float>(rand()) / RAND_MAX;
    }

    // Device memory allocation.
    float *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc((void **)&d_a, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void **)&d_b, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void **)&d_c, sizeof(float)));

    // Copy inputs to the device and zero the accumulator.
    CUDA_CHECK(cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_c, &h_c, sizeof(float), cudaMemcpyHostToDevice));

    // 1D launch configuration: ceil-div so the tail elements are covered;
    // the kernel bounds-checks out-of-range threads.
    const int blockSize = 256;  // multiple of 32, as the kernel requires
    const int numBlocks = (N + blockSize - 1) / blockSize;

    inner_product_kernel<<<numBlocks, blockSize>>>(d_a, d_b, d_c, N);
    CUDA_CHECK(cudaGetLastError());  // surfaces launch-configuration errors

    // Blocking D2H copy also synchronizes with the kernel, so h_c is final.
    CUDA_CHECK(cudaMemcpy(&h_c, d_c, sizeof(float), cudaMemcpyDeviceToHost));

    std::cout << "Inner product: " << h_c << std::endl;

    // Free device and host memory.
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));
    free(h_a);
    free(h_b);

    return 0;
}

and

#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define M 1024  // Rows of A and C
#define N 1024  // Columns of B and C
#define K 1024  // Columns of A and rows of B

// Abort with a diagnostic if a CUDA runtime call fails. Kernel launches do
// not return errors directly, so we also call cudaGetLastError() after the
// launch below.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,        \
                         __LINE__, cudaGetErrorString(err_));               \
            std::abort();                                                   \
        }                                                                   \
    } while (0)

// Naive matrix product C = A * B with row-major storage.
// (Previously misnamed "outer_product_kernel": the K-deep inner loop makes
// this a full MxKxN matrix multiplication, not an outer product.)
//
// One thread computes one element C[row][col]. A is MxK, B is KxN, C is MxN.
// Grid/block: 2D launch; blockIdx.x/threadIdx.x span columns, .y spans rows,
// so consecutive threads in a warp read consecutive B columns and write
// consecutive C elements (coalesced).
__global__ void matrix_product_kernel(const float *__restrict__ A,
                                      const float *__restrict__ B,
                                      float *__restrict__ C) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < M && col < N) {
        float value = 0.0f;
        for (int k = 0; k < K; ++k) {
            value += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = value;
    }
}

int main() {
    // Host memory allocation for A (MxK), B (KxN), and C (MxN).
    float *h_A = (float *)malloc(M * K * sizeof(float));
    float *h_B = (float *)malloc(K * N * sizeof(float));
    float *h_C = (float *)malloc(M * N * sizeof(float));
    if (h_A == nullptr || h_B == nullptr || h_C == nullptr) {
        std::fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Initialize A and B with uniform random values in [0, 1]; zero C.
    for (int i = 0; i < M * K; i++) {
        h_A[i] = static_cast<float>(rand()) / RAND_MAX;
    }
    for (int i = 0; i < K * N; i++) {
        h_B[i] = static_cast<float>(rand()) / RAND_MAX;
    }
    for (int i = 0; i < M * N; i++) {
        h_C[i] = 0.0f;
    }

    // Device memory allocation.
    float *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc((void **)&d_A, M * K * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void **)&d_B, K * N * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void **)&d_C, M * N * sizeof(float)));

    // Copy matrices to the device.
    CUDA_CHECK(cudaMemcpy(d_A, h_A, M * K * sizeof(float),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, K * N * sizeof(float),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_C, h_C, M * N * sizeof(float),
                          cudaMemcpyHostToDevice));

    // 2D launch: 16x16 = 256 threads/block; ceil-div grids cover the edges
    // (the kernel bounds-checks row/col).
    dim3 threadsPerBlock(16, 16);
    dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (M + threadsPerBlock.y - 1) / threadsPerBlock.y);

    matrix_product_kernel<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C);
    CUDA_CHECK(cudaGetLastError());  // surfaces launch-configuration errors

    // Blocking D2H copy also synchronizes with the kernel.
    CUDA_CHECK(cudaMemcpy(h_C, d_C, M * N * sizeof(float),
                          cudaMemcpyDeviceToHost));

    // Print part of the result to verify (e.g., the first element).
    std::cout << "Matrix product result (C[0][0]): " << h_C[0] << std::endl;

    // Free device and host memory.
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}