#include <cuda_runtime.h>

#include <float.h>
#include <stdio.h>
#include <time.h>
// NOTE(review): the original include list had five entries whose header names
// were lost; the four above cover every name used in this file
// (CUDA runtime API, DBL_MAX, printf/fflush, clock_gettime). Confirm whether a
// fifth header is still required by code outside this view.

// Host-side timer: returns the CLOCK_MONOTONIC reading in milliseconds.
static double get_time_ms() {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000.0 + ts.tv_nsec / 1000000.0;
}

// Data structures (their layout must match the C++ side of the project).

// One input record; the kernel below reads open, high, low and close.
struct GpuRecord {
    double timestamp;
    double open;
    double high;
    double low;
    double close;
    double volume;
};

// Per-day aggregation result written by aggregate_kernel.
struct GpuDayStats {
    long long day;     // copied from day_indices[d]
    double avg;        // mean of (low + high) / 2 over the day's records
    double open_min;   // smallest `open` seen in the day
    double open_max;   // largest `open` seen in the day
    double close_min;  // smallest `close` seen in the day
    double close_max;  // largest `close` seen in the day
    long long count;   // number of records in the day
};

// Reports whether a usable CUDA device is present.
//
// Returns 1 when at least one device is reported AND the CUDA context can be
// initialised, 0 otherwise. As a side effect the context is initialised early
// so that the first real API call does not pay that cost.
extern "C" int gpu_is_available() {
    int device_count = 0;
    if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
        // Reset the runtime's last-error state so the failed probe does not
        // surface later from an unrelated cudaGetLastError() call.
        cudaGetLastError();
        return 0;
    }
    if (device_count <= 0) {
        return 0;
    }

    // cudaFree(0) forces context initialisation. If that fails the device
    // cannot actually be used, so report "not available".
    if (cudaFree(0) != cudaSuccess) {
        cudaGetLastError();
        return 0;
    }
    return 1;
}

// Aggregation kernel: each thread processes exactly one day.
//
// Launch layout: 1-D grid of 1-D blocks with at least num_days threads in
// total; threads whose global index is >= num_days exit immediately.
//
// Parameters:
//   records      - all records; day d occupies
//                  records[day_offsets[d] .. day_offsets[d] + day_counts[d] - 1]
//   num_records  - length of `records` (currently not read by the kernel)
//   day_offsets  - start of each day inside `records`
//   day_counts   - number of records in each day
//   day_indices  - day identifier copied into the output
//   num_days     - number of days (length of the three per-day arrays)
//   out_stats    - output, one GpuDayStats per day
//
// Precondition: every [offset, offset + count) range must lie inside
// `records`; the kernel performs no range validation.
//
// For a day with count == 0 the loop body never runs: the min/max fields keep
// their +/-DBL_MAX sentinels and avg is 0.0 / 0.0 (NaN).
__global__ void aggregate_kernel(
    const GpuRecord* records,
    int num_records,
    const int* day_offsets,        // start of each day in `records`
    const int* day_counts,         // number of records in each day
    const long long* day_indices,  // day identifiers
    int num_days,
    GpuDayStats* out_stats)
{
    // Global thread index == day index.
    int d = blockIdx.x * blockDim.x + threadIdx.x;
    if (d >= num_days) return;

    int offset = day_offsets[d];
    int count = day_counts[d];

    GpuDayStats stats;
    stats.day = day_indices[d];
    stats.open_min = DBL_MAX;
    stats.open_max = -DBL_MAX;
    stats.close_min = DBL_MAX;
    stats.close_max = -DBL_MAX;
    stats.count = count;

    double avg_sum = 0.0;

    for (int i = 0; i < count; i++) {
        const GpuRecord& r = records[offset + i];

        // Accumulate the per-record midpoint (low + high) / 2.
        avg_sum += (r.low + r.high) / 2.0;

        // min/max of Open
        if (r.open < stats.open_min) stats.open_min = r.open;
        if (r.open > stats.open_max) stats.open_max = r.open;

        // min/max of Close
        if (r.close < stats.close_min) stats.close_min = r.close;
        if (r.close > stats.close_max) stats.close_max = r.close;
    }

    stats.avg = avg_sum / static_cast<double>(count);
    out_stats[d] = stats;
}

// Aggregation function called from C++.
namespace {

// Identifiers for the start/stop event pair of each timed stage.
enum AggregateEventId {
    kEvMallocStart, kEvMallocStop,
    kEvH2DStart,    kEvH2DStop,
    kEvKernelStart, kEvKernelStop,
    kEvD2HStart,    kEvD2HStop,
    kEvFreeStart,   kEvFreeStop,
    kEvCount
};

// Owns the device buffers and timing events of one gpu_aggregate_days() call.
// The destructor releases whatever is still held, so every early return in
// the caller cleans up without repeating the cudaFree/cudaEventDestroy lists.
struct AggregateScope {
    GpuRecord*   d_records;
    int*         d_day_offsets;
    int*         d_day_counts;
    long long*   d_day_indices;
    GpuDayStats* d_out_stats;
    cudaEvent_t  events[kEvCount];

    AggregateScope()
        : d_records(nullptr), d_day_offsets(nullptr), d_day_counts(nullptr),
          d_day_indices(nullptr), d_out_stats(nullptr) {
        for (int i = 0; i < kEvCount; ++i) events[i] = nullptr;
    }

    AggregateScope(const AggregateScope&) = delete;
    AggregateScope& operator=(const AggregateScope&) = delete;

    // Creates all timing events; returns false as soon as one creation fails.
    bool create_events() {
        for (int i = 0; i < kEvCount; ++i) {
            if (cudaEventCreate(&events[i]) != cudaSuccess) {
                events[i] = nullptr;
                return false;
            }
        }
        return true;
    }

    // Frees the device buffers and nulls the pointers so a second call
    // (from the destructor) only passes null pointers to cudaFree.
    void free_buffers() {
        cudaFree(d_records);     d_records = nullptr;
        cudaFree(d_day_offsets); d_day_offsets = nullptr;
        cudaFree(d_day_counts);  d_day_counts = nullptr;
        cudaFree(d_day_indices); d_day_indices = nullptr;
        cudaFree(d_out_stats);   d_out_stats = nullptr;
    }

    // Destroys every event that was successfully created.
    void destroy_events() {
        for (int i = 0; i < kEvCount; ++i) {
            if (events[i] != nullptr) {
                cudaEventDestroy(events[i]);
                events[i] = nullptr;
            }
        }
    }

    ~AggregateScope() {
        free_buffers();
        destroy_events();
    }
};

// Closes a timed stage: records `stop`, waits for it, and stores the time
// between `start` and `stop` (milliseconds) in *ms. Returns the result of the
// synchronisation so the caller can detect errors from preceding device work.
cudaError_t finish_stage(cudaEvent_t start, cudaEvent_t stop, float* ms) {
    *ms = 0.0f;
    cudaEventRecord(stop);
    cudaError_t sync_err = cudaEventSynchronize(stop);
    cudaEventElapsedTime(ms, start, stop);
    return sync_err;
}

}  // namespace

// Aggregates per-day statistics on the GPU and prints a timing breakdown.
//
// Copies the host arrays to the device, launches aggregate_kernel with one
// thread per day, copies the results into h_out_stats and releases all device
// resources. Each stage is timed with CUDA events; the breakdown is printed
// to stdout.
//
// Parameters:
//   h_records      - host array of num_records records
//   num_records    - number of records
//   h_day_offsets  - host array (num_days) with the start of each day
//   h_day_counts   - host array (num_days) with the record count of each day
//   h_day_indices  - host array (num_days) with the day identifiers
//   num_days       - number of days
//   h_out_stats    - host output array (num_days)
//
// Returns:
//     0          success (also when num_days == 0: nothing to aggregate)
//    -1 .. -5    cudaMalloc failed (records, offsets, counts, indices, output)
//    -6          timing events could not be created
//    -7          kernel launch or kernel execution failed
//   -10 .. -13   host-to-device copy failed (records, offsets, counts, indices)
//   -14          device-to-host copy of the results failed
//
// All device buffers and events are released on every return path.
extern "C" int gpu_aggregate_days(
    const GpuRecord* h_records,
    int num_records,
    const int* h_day_offsets,
    const int* h_day_counts,
    const long long* h_day_indices,
    int num_days,
    GpuDayStats* h_out_stats)
{
    // A zero-block launch is an invalid configuration; an empty request is
    // trivially complete, so succeed without touching the device.
    if (num_days == 0) {
        return 0;
    }

    const double cpu_total_start = get_time_ms();

    AggregateScope res;
    if (!res.create_events()) {
        return -6;
    }

    const size_t records_bytes = num_records * sizeof(GpuRecord);
    const size_t offsets_bytes = num_days * sizeof(int);
    const size_t counts_bytes  = num_days * sizeof(int);
    const size_t indices_bytes = num_days * sizeof(long long);
    const size_t stats_bytes   = num_days * sizeof(GpuDayStats);

    // === Timed stage: cudaMalloc ===
    cudaEventRecord(res.events[kEvMallocStart]);
    if (cudaMalloc(&res.d_records, records_bytes) != cudaSuccess) return -1;
    if (cudaMalloc(&res.d_day_offsets, offsets_bytes) != cudaSuccess) return -2;
    if (cudaMalloc(&res.d_day_counts, counts_bytes) != cudaSuccess) return -3;
    if (cudaMalloc(&res.d_day_indices, indices_bytes) != cudaSuccess) return -4;
    if (cudaMalloc(&res.d_out_stats, stats_bytes) != cudaSuccess) return -5;
    float time_malloc_ms = 0.0f;
    finish_stage(res.events[kEvMallocStart], res.events[kEvMallocStop], &time_malloc_ms);

    // === Timed stage: memcpy host -> device ===
    cudaEventRecord(res.events[kEvH2DStart]);
    if (cudaMemcpy(res.d_records, h_records, records_bytes,
                   cudaMemcpyHostToDevice) != cudaSuccess) return -10;
    if (cudaMemcpy(res.d_day_offsets, h_day_offsets, offsets_bytes,
                   cudaMemcpyHostToDevice) != cudaSuccess) return -11;
    if (cudaMemcpy(res.d_day_counts, h_day_counts, counts_bytes,
                   cudaMemcpyHostToDevice) != cudaSuccess) return -12;
    if (cudaMemcpy(res.d_day_indices, h_day_indices, indices_bytes,
                   cudaMemcpyHostToDevice) != cudaSuccess) return -13;
    float time_transfer_ms = 0.0f;
    finish_stage(res.events[kEvH2DStart], res.events[kEvH2DStop], &time_transfer_ms);

    // === Timed stage: kernel ===
    const int THREADS_PER_BLOCK = 256;
    // Ceil-division written so that it cannot overflow for large num_days
    // (num_days is > 0 here: 0 returned above, negatives fail in cudaMalloc).
    const int num_blocks = (num_days - 1) / THREADS_PER_BLOCK + 1;

    cudaEventRecord(res.events[kEvKernelStart]);
    aggregate_kernel<<<num_blocks, THREADS_PER_BLOCK>>>(
        res.d_records, num_records,
        res.d_day_offsets, res.d_day_counts, res.d_day_indices,
        num_days, res.d_out_stats
    );
    // Launch-configuration errors are reported here...
    if (cudaGetLastError() != cudaSuccess) return -7;
    // ...while faults during execution surface at the next synchronisation.
    float time_kernel_ms = 0.0f;
    if (finish_stage(res.events[kEvKernelStart], res.events[kEvKernelStop],
                     &time_kernel_ms) != cudaSuccess) {
        return -7;
    }

    // === Timed stage: memcpy device -> host ===
    cudaEventRecord(res.events[kEvD2HStart]);
    if (cudaMemcpy(h_out_stats, res.d_out_stats, stats_bytes,
                   cudaMemcpyDeviceToHost) != cudaSuccess) return -14;
    float time_copy_back_ms = 0.0f;
    finish_stage(res.events[kEvD2HStart], res.events[kEvD2HStop], &time_copy_back_ms);

    // === Timed stage: cudaFree ===
    cudaEventRecord(res.events[kEvFreeStart]);
    res.free_buffers();
    float time_free_ms = 0.0f;
    finish_stage(res.events[kEvFreeStart], res.events[kEvFreeStop], &time_free_ms);

    // Destroy the events before taking the final timestamp so the total
    // covers event teardown as well.
    res.destroy_events();
    const double cpu_total_ms = get_time_ms() - cpu_total_start;

    // Print the detailed statistics. The "GPU TOTAL" line reports the host
    // wall-clock time of the whole call, not the sum of the stage timings.
    printf(" GPU Timings (%d records, %d days):\n", num_records, num_days);
    printf(" cudaMalloc: %7.3f ms\n", time_malloc_ms);
    printf(" memcpy H->D: %7.3f ms\n", time_transfer_ms);
    printf(" kernel execution: %7.3f ms\n", time_kernel_ms);
    printf(" memcpy D->H: %7.3f ms\n", time_copy_back_ms);
    printf(" cudaFree: %7.3f ms\n", time_free_ms);
    printf(" GPU TOTAL: %7.3f ms\n", cpu_total_ms);
    fflush(stdout);

    return 0;
}