From 4be0c03f018337ab0b4c4cd48ddd940fac72f229 Mon Sep 17 00:00:00 2001
From: cflip <cflip@cflip.net>
Date: Sun, 11 Dec 2022 19:50:23 -0700
Subject: Add an experimental CUDA program to find the hash

Since the CPU can only do things so fast, I'd like to use GPU
acceleration to speed up the process. This simple CUDA program is able
to find hashes up to about 5 characters long, but doesn't seem to work
after that.

Eventually I'd like to integrate this with the main program instead of
compiling a separate executable.
---
 hashstrings.cu | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 hashstrings.cu

diff --git a/hashstrings.cu b/hashstrings.cu
new file mode 100644
index 0000000..6c72170
--- /dev/null
+++ b/hashstrings.cu
@@ -0,0 +1,69 @@
+#include <stdio.h>
+#include <math.h>
+
+#define STRING_LENGTH 4
+
+const int n = 26;
+const int num_elements = pow(n, STRING_LENGTH);
+const int threads_per_block = 256;
+
+__device__ long compute_hash_code(const char* str, int n)
+{
+	long result = 0;
+	for (int i = 0; i < n; i++)
+		result = 31 * result + str[i];
+	return result;
+}
+
+__global__ void find_from_generated_strings(long* hash_to_find, char* result_str,
+		int total_elements)
+{
+	char str[STRING_LENGTH + 1];
+
+	int i = threadIdx.x + blockIdx.x * blockDim.x;
+	if (i >= total_elements)
+		return;
+
+	for (int j = 0; j < STRING_LENGTH; j++) {
+		int string_index = STRING_LENGTH - (j + 1);
+		int char_index = (i / (int)pow(n, j)) % n;
+		str[string_index] = 'a' + char_index;
+	}
+
+	if (compute_hash_code(str, STRING_LENGTH) == *hash_to_find) {
+		memcpy(result_str, str, STRING_LENGTH);
+		return;
+	}
+}
+
+int main()
+{
+	long hash_to_find = 3446974;
+	char result[STRING_LENGTH + 1];
+
+	long* to_find_on_device;
+	char* result_string_on_device;
+
+	cudaMalloc(&to_find_on_device, sizeof(long));
+	cudaMalloc(&result_string_on_device, STRING_LENGTH + 1);
+
+	cudaMemcpy(to_find_on_device, &hash_to_find, sizeof(int),
+			cudaMemcpyHostToDevice);
+
+	const int num_blocks = (num_elements + threads_per_block - 1) /
+		threads_per_block;
+	printf("We have %d items to compute\n", num_elements);
+	printf("Running with %d blocks and %d threads per block\n", num_blocks,
+			threads_per_block);
+	find_from_generated_strings<<<num_blocks, threads_per_block>>>(to_find_on_device,
+			result_string_on_device, num_elements);
+
+	cudaMemcpy(result, result_string_on_device, STRING_LENGTH + 1, cudaMemcpyDeviceToHost);
+
+	printf("%s\n", result);
+
+	cudaFree(to_find_on_device);
+	cudaFree(result_string_on_device);
+
+	return 0;
+}
-- 
cgit v1.2.3