263 lines
8.7 KiB
Plaintext
263 lines
8.7 KiB
Plaintext
#include <cuda_runtime.h>

#include <cfloat>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <ctime>
|
||
|
||
// Host-side timer: current CLOCK_MONOTONIC reading expressed in milliseconds.
static double get_time_ms() {
    timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const double whole_seconds_ms = now.tv_sec * 1000.0;
    const double nanoseconds_ms   = now.tv_nsec / 1000000.0;
    return whole_seconds_ms + nanoseconds_ms;
}
|
||
|
||
// Data structures (their layout must match the C++ code).

// One input record: six consecutive doubles, in this exact order.
struct GpuRecord {
    double timestamp;
    double open, high, low, close;  // the aggregation kernel reads these four
    double volume;
};
|
||
|
||
// Per-day aggregation result written by aggregate_kernel.
// The field order is part of the contract with the C++ side; do not reorder.
struct GpuDayStats {
    long long day;                   // day index copied from the day_indices input
    double avg;                      // mean of (low + high) / 2 over the day's records
    double open_min, open_max;       // extrema of the `open` field
    double close_min, close_max;     // extrema of the `close` field
    long long count;                 // number of records in the day
};
|
||
|
||
// Reports whether a usable CUDA device is present.
//
// Returns 1 when cudaGetDeviceCount() reports at least one device AND the
// CUDA context can actually be initialized; returns 0 otherwise.
extern "C" int gpu_is_available() {
    int device_count = 0;
    if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
        // Reset the recorded error so that a failed probe does not show up
        // in a later, unrelated cudaGetLastError() check.
        cudaGetLastError();
        return 0;
    }
    if (device_count <= 0) {
        return 0;
    }

    // Initialize the CUDA context ahead of time (cudaFree(0) forces the
    // initialization). A device that is listed but cannot be initialized is
    // not usable, so a failure here is reported as "not available".
    if (cudaFree(0) != cudaSuccess) {
        cudaGetLastError();
        return 0;
    }
    return 1;
}
|
||
|
||
// Aggregation kernel: each thread processes exactly one day.
//
// Thread d (global 1D index) reads day_counts[d] records starting at
// records[day_offsets[d]] and writes out_stats[d]:
//   day                   = day_indices[d]
//   avg                   = mean of (low + high) / 2 over the day's records
//   open_min / open_max   = extrema of `open`
//   close_min / close_max = extrema of `close`
//   count                 = number of records actually aggregated
//
// Launch layout: 1D grid / 1D blocks with at least num_days threads in total;
// threads with d >= num_days exit immediately.
//
// Parameters:
//   records      input records (num_records elements)
//   num_records  length of `records`; used to bound every read
//   day_offsets  start of each day inside `records` (num_days elements)
//   day_counts   number of records in each day (num_days elements)
//   day_indices  day identifiers copied to the output (num_days elements)
//   num_days     number of days / output elements
//   out_stats    output array (num_days elements)
//
// Malformed ranges are made safe instead of read out of bounds: a negative
// offset, an offset past the end, or a negative count is treated as an empty
// day, and a range running past num_records is truncated. An empty day gets
// avg = 0.0 and keeps the sentinel extrema (DBL_MAX / -DBL_MAX).
__global__ void aggregate_kernel(
    const GpuRecord* records,
    int num_records,
    const int* day_offsets,     // start of each day in the records array
    const int* day_counts,      // number of records in each day
    const long long* day_indices,  // day identifiers
    int num_days,
    GpuDayStats* out_stats)
{
    // Global thread index == day index.
    const int d = blockIdx.x * blockDim.x + threadIdx.x;
    if (d >= num_days) return;

    // 64-bit arithmetic so that offset + count cannot overflow int.
    const long long first = day_offsets[d];
    long long n = day_counts[d];
    if (first < 0 || first > num_records || n < 0) {
        n = 0;
    } else if (n > num_records - first) {
        n = num_records - first;
    }

    GpuDayStats stats;
    stats.day       = day_indices[d];
    stats.open_min  = DBL_MAX;
    stats.open_max  = -DBL_MAX;
    stats.close_min = DBL_MAX;
    stats.close_max = -DBL_MAX;
    stats.count     = n;

    double avg_sum = 0.0;

    for (long long i = 0; i < n; ++i) {
        const GpuRecord& r = records[first + i];

        // Accumulate the per-record midpoint (low + high) / 2.
        avg_sum += (r.low + r.high) / 2.0;

        // min/max of open
        if (r.open < stats.open_min) stats.open_min = r.open;
        if (r.open > stats.open_max) stats.open_max = r.open;

        // min/max of close
        if (r.close < stats.close_min) stats.close_min = r.close;
        if (r.close > stats.close_max) stats.close_max = r.close;
    }

    // Guard the division: an empty day would otherwise produce 0.0 / 0 = NaN.
    stats.avg = (n > 0) ? avg_sum / static_cast<double>(n) : 0.0;
    out_stats[d] = stats;
}
|
||
|
||
namespace {

// Indices into AggregateResources::events: one start/stop pair per timed phase.
enum AggregateEvent {
    EV_MALLOC_START, EV_MALLOC_STOP,
    EV_H2D_START,    EV_H2D_STOP,
    EV_KERNEL_START, EV_KERNEL_STOP,
    EV_D2H_START,    EV_D2H_STOP,
    EV_FREE_START,   EV_FREE_STOP,
    EV_COUNT
};

// Owns the device buffers and timing events of one gpu_aggregate_days() call.
// The destructor releases whatever is still held, so every return path
// (including the error ones) cleans up.
struct AggregateResources {
    GpuRecord*   records     = nullptr;
    int*         day_offsets = nullptr;
    int*         day_counts  = nullptr;
    long long*   day_indices = nullptr;
    GpuDayStats* out_stats   = nullptr;

    cudaEvent_t events[EV_COUNT] = {};
    int         events_created   = 0;

    AggregateResources() = default;
    AggregateResources(const AggregateResources&) = delete;
    AggregateResources& operator=(const AggregateResources&) = delete;

    ~AggregateResources() {
        free_buffers();
        destroy_events();
    }

    // Creates all timing events; returns false on the first failure.
    bool create_events() {
        while (events_created < EV_COUNT) {
            if (cudaEventCreate(&events[events_created]) != cudaSuccess) return false;
            ++events_created;
        }
        return true;
    }

    // Destroys the events created so far; safe to call more than once.
    void destroy_events() {
        while (events_created > 0) {
            --events_created;
            cudaEventDestroy(events[events_created]);
        }
    }

    // Frees all device buffers; safe to call more than once because the
    // pointers are reset and cudaFree(nullptr) is a no-op.
    void free_buffers() {
        cudaFree(records);     records     = nullptr;
        cudaFree(day_offsets); day_offsets = nullptr;
        cudaFree(day_counts);  day_counts  = nullptr;
        cudaFree(day_indices); day_indices = nullptr;
        cudaFree(out_stats);   out_stats   = nullptr;
    }

    void record(AggregateEvent ev) { cudaEventRecord(events[ev]); }

    // Waits for `stop` and stores the start->stop time in *ms. On failure *ms
    // is left untouched and the error is returned.
    cudaError_t elapsed_ms(AggregateEvent start, AggregateEvent stop, float* ms) {
        const cudaError_t err = cudaEventSynchronize(events[stop]);
        if (err != cudaSuccess) return err;
        return cudaEventElapsedTime(ms, events[start], events[stop]);
    }
};

}  // namespace

// Aggregation entry point called from C++.
//
// Copies the records and the per-day index arrays to the device, runs
// aggregate_kernel (one thread per day), copies the per-day statistics back
// into h_out_stats and prints a timing breakdown of each phase to stdout.
//
// Parameters:
//   h_records      host array of num_records records
//   num_records    number of records (>= 0)
//   h_day_offsets  host array, start of each day inside h_records (num_days)
//   h_day_counts   host array, number of records in each day (num_days)
//   h_day_indices  host array, day identifiers (num_days)
//   num_days       number of days (>= 0); 0 is a successful no-op
//   h_out_stats    host output array with room for num_days elements
//
// Returns 0 on success, otherwise a negative code:
//   -1 .. -5    cudaMalloc failed (records, offsets, counts, indices, output)
//   -6          invalid arguments (negative size or null pointer)
//   -7          kernel launch failed
//   -8          kernel execution failed
//   -9          timing events could not be created
//   -10 .. -13  host->device copy failed (records, offsets, counts, indices)
//   -14         device->host copy of the results failed
// All device buffers and events are released on every path.
extern "C" int gpu_aggregate_days(
    const GpuRecord* h_records,
    int num_records,
    const int* h_day_offsets,
    const int* h_day_counts,
    const long long* h_day_indices,
    int num_days,
    GpuDayStats* h_out_stats)
{
    const double cpu_total_start = get_time_ms();

    if (num_records < 0 || num_days < 0) return -6;
    // Nothing to aggregate; a 0-block launch would be an invalid configuration.
    if (num_days == 0) return 0;
    if (!h_day_offsets || !h_day_counts || !h_day_indices || !h_out_stats) return -6;
    if (num_records > 0 && !h_records) return -6;

    // === Create the CUDA events used for timing ===
    AggregateResources res;
    if (!res.create_events()) return -9;

    const size_t records_bytes = static_cast<size_t>(num_records) * sizeof(GpuRecord);
    const size_t ints_bytes    = static_cast<size_t>(num_days) * sizeof(int);
    const size_t indices_bytes = static_cast<size_t>(num_days) * sizeof(long long);
    const size_t stats_bytes   = static_cast<size_t>(num_days) * sizeof(GpuDayStats);

    // === Timed phase: cudaMalloc ===
    res.record(EV_MALLOC_START);

    if (cudaMalloc(&res.records, records_bytes) != cudaSuccess) return -1;
    if (cudaMalloc(&res.day_offsets, ints_bytes) != cudaSuccess) return -2;
    if (cudaMalloc(&res.day_counts, ints_bytes) != cudaSuccess) return -3;
    if (cudaMalloc(&res.day_indices, indices_bytes) != cudaSuccess) return -4;
    if (cudaMalloc(&res.out_stats, stats_bytes) != cudaSuccess) return -5;

    res.record(EV_MALLOC_STOP);
    float time_malloc_ms = 0;
    res.elapsed_ms(EV_MALLOC_START, EV_MALLOC_STOP, &time_malloc_ms);

    // === Timed phase: memcpy H->D ===
    res.record(EV_H2D_START);

    if (cudaMemcpy(res.records, h_records, records_bytes,
                   cudaMemcpyHostToDevice) != cudaSuccess) return -10;
    if (cudaMemcpy(res.day_offsets, h_day_offsets, ints_bytes,
                   cudaMemcpyHostToDevice) != cudaSuccess) return -11;
    if (cudaMemcpy(res.day_counts, h_day_counts, ints_bytes,
                   cudaMemcpyHostToDevice) != cudaSuccess) return -12;
    if (cudaMemcpy(res.day_indices, h_day_indices, indices_bytes,
                   cudaMemcpyHostToDevice) != cudaSuccess) return -13;

    res.record(EV_H2D_STOP);
    float time_transfer_ms = 0;
    res.elapsed_ms(EV_H2D_START, EV_H2D_STOP, &time_transfer_ms);

    // === Timed phase: kernel ===
    const int THREADS_PER_BLOCK = 256;
    // Ceil-division in 64 bits so that num_days close to INT_MAX cannot overflow.
    const unsigned int num_blocks = static_cast<unsigned int>(
        (static_cast<long long>(num_days) + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);

    res.record(EV_KERNEL_START);

    aggregate_kernel<<<num_blocks, THREADS_PER_BLOCK>>>(
        res.records, num_records,
        res.day_offsets, res.day_counts, res.day_indices,
        num_days, res.out_stats);

    // Launch-configuration errors are reported here...
    if (cudaGetLastError() != cudaSuccess) return -7;

    res.record(EV_KERNEL_STOP);
    // ...while faults during execution surface at the next synchronization.
    float time_kernel_ms = 0;
    if (res.elapsed_ms(EV_KERNEL_START, EV_KERNEL_STOP, &time_kernel_ms) != cudaSuccess) {
        return -8;
    }

    // === Timed phase: memcpy D->H ===
    res.record(EV_D2H_START);
    if (cudaMemcpy(h_out_stats, res.out_stats, stats_bytes,
                   cudaMemcpyDeviceToHost) != cudaSuccess) return -14;
    res.record(EV_D2H_STOP);

    float time_copy_back_ms = 0;
    res.elapsed_ms(EV_D2H_START, EV_D2H_STOP, &time_copy_back_ms);

    // === Timed phase: cudaFree ===
    res.record(EV_FREE_START);
    res.free_buffers();
    res.record(EV_FREE_STOP);

    float time_free_ms = 0;
    res.elapsed_ms(EV_FREE_START, EV_FREE_STOP, &time_free_ms);

    // === Release the events ===
    // Done explicitly (not left to the destructor) so that the wall-clock
    // total below still includes the event teardown.
    res.destroy_events();

    const double cpu_total_ms = get_time_ms() - cpu_total_start;

    // Print the detailed statistics. "GPU TOTAL" is the host wall-clock time of
    // the whole call, so it also covers event creation and destruction.
    printf(" GPU Timings (%d records, %d days):\n", num_records, num_days);
    printf(" cudaMalloc: %7.3f ms\n", time_malloc_ms);
    printf(" memcpy H->D: %7.3f ms\n", time_transfer_ms);
    printf(" kernel execution: %7.3f ms\n", time_kernel_ms);
    printf(" memcpy D->H: %7.3f ms\n", time_copy_back_ms);
    printf(" cudaFree: %7.3f ms\n", time_free_ms);
    printf(" GPU TOTAL: %7.3f ms\n", cpu_total_ms);
    fflush(stdout);

    return 0;
}
|