Files
supercomputers/src/gpu_plugin.cu
2025-12-13 12:45:29 +00:00

263 lines
8.7 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <cuda_runtime.h>
#include <cstdint>
#include <cfloat>
#include <cstdio>
#include <ctime>
// Host-side timer: current CLOCK_MONOTONIC reading expressed in milliseconds.
static double get_time_ms() {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    // Whole seconds and the nanosecond remainder are converted separately
    // and then summed.
    const double from_seconds = now.tv_sec * 1000.0;
    const double from_nanos = now.tv_nsec / 1000000.0;
    return from_seconds + from_nanos;
}
// Data structures (must match the C++ code).
//
// GpuRecord is one input record: six doubles in the order
// timestamp, open, high, low, close, volume. The field order and types are
// part of the contract with the host code, so they must not be rearranged.
// In the code shown in this file, aggregate_kernel reads open, high, low and
// close; timestamp and volume are carried along but not read.
struct GpuRecord {
// NOTE(review): the unit/epoch of timestamp is not visible in this file —
// confirm against the host-side definition.
double timestamp;
double open;
double high;
double low;
double close;
double volume;
};
// Per-day aggregation result written by aggregate_kernel (one element per
// day). Like GpuRecord, the layout is shared with the host code and must not
// be rearranged.
struct GpuDayStats {
// Day identifier, copied from the day_indices input array.
long long day;
// Mean of (low + high) / 2 over the day's records.
double avg;
// Smallest / largest open value seen for the day. The kernel starts these at
// DBL_MAX / -DBL_MAX, so they keep those values if no record is processed.
double open_min;
double open_max;
// Smallest / largest close value seen for the day (same initial values as
// the open_* pair).
double close_min;
double close_max;
// Number of records aggregated for the day.
long long count;
};
// Reports whether a usable CUDA device is present.
//
// Returns 1 when at least one device is enumerated AND a CUDA context could
// be created on the current device, 0 otherwise. On success the context is
// already initialized, so the first real CUDA call does not pay the start-up
// cost.
extern "C" int gpu_is_available() {
    int device_count = 0;
    if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count <= 0) {
        // Reset the recorded error so that it is not reported by an unrelated
        // cudaGetLastError() call later in the process.
        (void)cudaGetLastError();
        return 0;
    }

    // cudaFree(0) forces context initialization. If that fails (for example
    // the device is busy in exclusive mode or the driver is unusable), later
    // CUDA calls would fail as well, so the GPU is reported as unavailable.
    if (cudaFree(0) != cudaSuccess) {
        (void)cudaGetLastError();
        return 0;
    }
    return 1;
}
// Aggregation kernel: each thread processes one day.
//
// Launch layout: 1-D grid of 1-D blocks. The thread with global index
// d = blockIdx.x * blockDim.x + threadIdx.x handles day d; threads with
// d >= num_days return immediately. No shared memory is used.
//
// Parameters:
//   records      - all input records; day d owns the records starting at
//                  day_offsets[d], day_counts[d] of them
//   num_records  - number of elements in records; used to bound each day's range
//   day_offsets  - start of each day in the records array
//   day_counts   - number of records in each day
//   day_indices  - day identifiers, copied into the result
//   num_days     - number of days (length of the three per-day arrays and of
//                  out_stats)
//   out_stats    - output, one GpuDayStats per day
//
// Preconditions: the pointers are declared __restrict__, so the buffers must
// not overlap.
//
// Edge cases: a day whose [offset, offset + count) range does not lie inside
// [0, num_records) is clamped to the part that does (an invalid offset or a
// negative count yields an empty day), and stats.count reports the number of
// records actually aggregated. For an empty day avg is NaN (0.0 / 0.0) and the
// min/max fields keep their initial DBL_MAX / -DBL_MAX values.
__global__ void aggregate_kernel(
    const GpuRecord* __restrict__ records,
    int num_records,
    const int* __restrict__ day_offsets,   // start of each day in records
    const int* __restrict__ day_counts,    // number of records in each day
    const long long* __restrict__ day_indices,  // day identifiers
    int num_days,
    GpuDayStats* __restrict__ out_stats)
{
    // Global thread index == day index.
    const int d = blockIdx.x * blockDim.x + threadIdx.x;
    if (d >= num_days) return;

    const int offset = day_offsets[d];
    int count = day_counts[d];

    // Never read outside records[0 .. num_records): an inconsistent
    // offset/count pair would otherwise turn into an out-of-bounds global
    // load. offset is known to be in [0, num_records) before the subtraction,
    // so num_records - offset cannot overflow.
    if (offset < 0 || offset >= num_records || count < 0) {
        count = 0;
    } else if (count > num_records - offset) {
        count = num_records - offset;
    }

    GpuDayStats stats;
    stats.day = day_indices[d];
    stats.open_min = DBL_MAX;
    stats.open_max = -DBL_MAX;
    stats.close_min = DBL_MAX;
    stats.close_max = -DBL_MAX;
    stats.count = count;

    double avg_sum = 0.0;
    for (int i = 0; i < count; i++) {
        const GpuRecord& r = records[offset + i];

        // Accumulate the per-record mid price (low + high) / 2.
        avg_sum += (r.low + r.high) / 2.0;

        // min/max of open
        if (r.open < stats.open_min) stats.open_min = r.open;
        if (r.open > stats.open_max) stats.open_max = r.open;

        // min/max of close
        if (r.close < stats.close_min) stats.close_min = r.close;
        if (r.close > stats.close_max) stats.close_max = r.close;
    }

    // For count == 0 this is 0.0 / 0.0, i.e. NaN; callers can tell an empty
    // day apart through stats.count.
    stats.avg = avg_sum / static_cast<double>(count);
    out_stats[d] = stats;
}
namespace {

// Individually timed phases of gpu_aggregate_days.
enum GpuAggregatePhase {
    GPU_PHASE_MALLOC = 0,
    GPU_PHASE_H2D,
    GPU_PHASE_KERNEL,
    GPU_PHASE_D2H,
    GPU_PHASE_FREE,
    GPU_PHASE_COUNT
};

// Owns the device buffers and the timing events of one gpu_aggregate_days
// call, so that every return path (including the error ones) releases them.
struct GpuAggregateScratch {
    GpuRecord*   records;
    int*         day_offsets;
    int*         day_counts;
    long long*   day_indices;
    GpuDayStats* out_stats;
    cudaEvent_t  start[GPU_PHASE_COUNT];
    cudaEvent_t  stop[GPU_PHASE_COUNT];

    GpuAggregateScratch()
        : records(nullptr), day_offsets(nullptr), day_counts(nullptr),
          day_indices(nullptr), out_stats(nullptr)
    {
        for (int p = 0; p < GPU_PHASE_COUNT; ++p) {
            start[p] = nullptr;
            stop[p] = nullptr;
        }
    }

    ~GpuAggregateScratch() {
        free_buffers();
        destroy_events();
    }

    GpuAggregateScratch(const GpuAggregateScratch&) = delete;
    GpuAggregateScratch& operator=(const GpuAggregateScratch&) = delete;

    // Timing is diagnostic only: an event that cannot be created stays null
    // and the corresponding phase is simply reported as 0 ms.
    void create_events() {
        for (int p = 0; p < GPU_PHASE_COUNT; ++p) {
            if (cudaEventCreate(&start[p]) != cudaSuccess) start[p] = nullptr;
            if (cudaEventCreate(&stop[p]) != cudaSuccess) stop[p] = nullptr;
        }
    }

    // Safe to call more than once.
    void destroy_events() {
        for (int p = 0; p < GPU_PHASE_COUNT; ++p) {
            if (start[p] != nullptr) { cudaEventDestroy(start[p]); start[p] = nullptr; }
            if (stop[p] != nullptr)  { cudaEventDestroy(stop[p]);  stop[p] = nullptr; }
        }
    }

    // Safe to call more than once: cudaFree(nullptr) is a no-op.
    void free_buffers() {
        cudaFree(records);     records = nullptr;
        cudaFree(day_offsets); day_offsets = nullptr;
        cudaFree(day_counts);  day_counts = nullptr;
        cudaFree(day_indices); day_indices = nullptr;
        cudaFree(out_stats);   out_stats = nullptr;
    }

    // Marks the beginning of a timed phase.
    void begin(GpuAggregatePhase p) {
        if (start[p] != nullptr) cudaEventRecord(start[p]);
    }

    // Marks the end of a timed phase, waits for it and returns its duration
    // in milliseconds (0 if the phase could not be timed).
    float end(GpuAggregatePhase p) {
        float ms = 0.0f;
        if (start[p] == nullptr || stop[p] == nullptr) return ms;
        cudaEventRecord(stop[p]);
        cudaEventSynchronize(stop[p]);
        if (cudaEventElapsedTime(&ms, start[p], stop[p]) != cudaSuccess) ms = 0.0f;
        return ms;
    }
};

// cudaMalloc wrapper that guarantees a null pointer on failure, so the
// scratch destructor never frees an indeterminate value.
template <typename T>
bool gpu_aggregate_alloc(T*& ptr, size_t bytes) {
    if (cudaMalloc(&ptr, bytes) == cudaSuccess) return true;
    ptr = nullptr;
    return false;
}

}  // namespace

// Aggregation entry point called from the C++ host code.
//
// Copies the records and the per-day layout to the device, runs
// aggregate_kernel with one thread per day, copies the per-day statistics
// back into h_out_stats and prints a timing breakdown to stdout.
//
// Parameters:
//   h_records      - host array of num_records records
//   num_records    - number of records
//   h_day_offsets  - host array, start of each day in h_records
//   h_day_counts   - host array, number of records in each day
//   h_day_indices  - host array, identifier of each day
//   num_days       - number of days (length of the three per-day arrays)
//   h_out_stats    - host output array with room for num_days elements
//
// Returns 0 on success (num_days == 0 is a successful no-op that touches
// neither the GPU nor h_out_stats) or a negative code:
//   -1 .. -5    allocation of the records / offsets / counts / indices /
//               output device buffer failed (-1 and -2 are also returned for
//               a negative num_records / num_days respectively)
//   -7          kernel launch failed
//   -10 .. -13  host-to-device copy of records / offsets / counts / indices
//               failed
//   -14         device-to-host copy of the results failed; this is also where
//               a fault raised while the kernel was running is reported
// All device buffers and events are released on every return path.
extern "C" int gpu_aggregate_days(
    const GpuRecord* h_records,
    int num_records,
    const int* h_day_offsets,
    const int* h_day_counts,
    const long long* h_day_indices,
    int num_days,
    GpuDayStats* h_out_stats)
{
    // Negative sizes would wrap around to huge allocation requests; report
    // them with the code of the allocation that would have failed.
    if (num_records < 0) return -1;
    if (num_days < 0) return -2;
    // Nothing to aggregate; a zero-block launch would be rejected by CUDA.
    if (num_days == 0) return 0;

    const double cpu_total_start = get_time_ms();

    GpuAggregateScratch gpu;  // releases buffers and events when it goes out of scope
    gpu.create_events();

    const size_t records_bytes = static_cast<size_t>(num_records) * sizeof(GpuRecord);
    const size_t ints_bytes    = static_cast<size_t>(num_days) * sizeof(int);
    const size_t indices_bytes = static_cast<size_t>(num_days) * sizeof(long long);
    const size_t stats_bytes   = static_cast<size_t>(num_days) * sizeof(GpuDayStats);

    // === cudaMalloc ===
    gpu.begin(GPU_PHASE_MALLOC);
    if (!gpu_aggregate_alloc(gpu.records, records_bytes))     return -1;
    if (!gpu_aggregate_alloc(gpu.day_offsets, ints_bytes))    return -2;
    if (!gpu_aggregate_alloc(gpu.day_counts, ints_bytes))     return -3;
    if (!gpu_aggregate_alloc(gpu.day_indices, indices_bytes)) return -4;
    if (!gpu_aggregate_alloc(gpu.out_stats, stats_bytes))     return -5;
    const float time_malloc_ms = gpu.end(GPU_PHASE_MALLOC);

    // === memcpy H->D ===
    gpu.begin(GPU_PHASE_H2D);
    if (cudaMemcpy(gpu.records, h_records, records_bytes,
                   cudaMemcpyHostToDevice) != cudaSuccess) return -10;
    if (cudaMemcpy(gpu.day_offsets, h_day_offsets, ints_bytes,
                   cudaMemcpyHostToDevice) != cudaSuccess) return -11;
    if (cudaMemcpy(gpu.day_counts, h_day_counts, ints_bytes,
                   cudaMemcpyHostToDevice) != cudaSuccess) return -12;
    if (cudaMemcpy(gpu.day_indices, h_day_indices, indices_bytes,
                   cudaMemcpyHostToDevice) != cudaSuccess) return -13;
    const float time_transfer_ms = gpu.end(GPU_PHASE_H2D);

    // === kernel ===
    const int THREADS_PER_BLOCK = 256;
    // Ceiling division written so that it cannot overflow for large num_days
    // (num_days >= 1 here).
    const int num_blocks = (num_days - 1) / THREADS_PER_BLOCK + 1;
    gpu.begin(GPU_PHASE_KERNEL);
    aggregate_kernel<<<num_blocks, THREADS_PER_BLOCK>>>(
        gpu.records, num_records,
        gpu.day_offsets, gpu.day_counts, gpu.day_indices,
        num_days, gpu.out_stats
    );
    // Launch-configuration errors are reported here; faults raised while the
    // kernel runs surface at the blocking copy below.
    if (cudaGetLastError() != cudaSuccess) return -7;
    const float time_kernel_ms = gpu.end(GPU_PHASE_KERNEL);

    // === memcpy D->H ===
    gpu.begin(GPU_PHASE_D2H);
    if (cudaMemcpy(h_out_stats, gpu.out_stats, stats_bytes,
                   cudaMemcpyDeviceToHost) != cudaSuccess) return -14;
    const float time_copy_back_ms = gpu.end(GPU_PHASE_D2H);

    // === cudaFree ===
    gpu.begin(GPU_PHASE_FREE);
    gpu.free_buffers();
    const float time_free_ms = gpu.end(GPU_PHASE_FREE);

    // Destroy the events before taking the final timestamp so that the total
    // covers the whole call, event setup and teardown included.
    gpu.destroy_events();
    const double cpu_total_ms = get_time_ms() - cpu_total_start;

    // Detailed timing report. "GPU TOTAL" is the host wall-clock time of the
    // whole call, not the sum of the per-phase event timings.
    printf(" GPU Timings (%d records, %d days):\n", num_records, num_days);
    printf(" cudaMalloc: %7.3f ms\n", time_malloc_ms);
    printf(" memcpy H->D: %7.3f ms\n", time_transfer_ms);
    printf(" kernel execution: %7.3f ms\n", time_kernel_ms);
    printf(" memcpy D->H: %7.3f ms\n", time_copy_back_ms);
    printf(" cudaFree: %7.3f ms\n", time_free_ms);
    printf(" GPU TOTAL: %7.3f ms\n", cpu_total_ms);
    fflush(stdout);
    return 0;
}