From 9e81174a6da2fd1fa922cbd8681ba8b12af03c2c Mon Sep 17 00:00:00 2001 From: Arity-T Date: Wed, 17 Dec 2025 13:47:29 +0000 Subject: [PATCH] =?UTF-8?q?=D0=A2=D0=B5=D1=81=D1=82=D0=BE=D0=B2=D1=8B?= =?UTF-8?q?=D0=B5=20=D1=8F=D0=B4=D1=80=D0=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/gpu_plugin.cu | 116 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 110 insertions(+), 6 deletions(-) diff --git a/src/gpu_plugin.cu b/src/gpu_plugin.cu index 9a55f53..ab7dfb8 100644 --- a/src/gpu_plugin.cu +++ b/src/gpu_plugin.cu @@ -199,6 +199,96 @@ __global__ void aggregate_periods_simple_kernel( out_stats[period_idx] = stats; } + + +static __device__ __forceinline__ double warp_reduce_sum(double v) { + for (int offset = 16; offset > 0; offset >>= 1) + v += __shfl_down_sync(0xffffffff, v, offset); + return v; +} +static __device__ __forceinline__ double warp_reduce_min(double v) { + for (int offset = 16; offset > 0; offset >>= 1) + v = fmin(v, __shfl_down_sync(0xffffffff, v, offset)); + return v; +} +static __device__ __forceinline__ double warp_reduce_max(double v) { + for (int offset = 16; offset > 0; offset >>= 1) + v = fmax(v, __shfl_down_sync(0xffffffff, v, offset)); + return v; +} +static __device__ __forceinline__ int warp_reduce_sum_int(int v) { + for (int offset = 16; offset > 0; offset >>= 1) + v += __shfl_down_sync(0xffffffff, v, offset); + return v; +} + +__global__ void aggregate_periods_warp_kernel( + const double* __restrict__ open, + const double* __restrict__ high, + const double* __restrict__ low, + const double* __restrict__ close, + const int64_t* __restrict__ unique_periods, + const int* __restrict__ offsets, + const int* __restrict__ counts, + int num_periods, + GpuPeriodStats* __restrict__ out_stats) +{ + int global_thread = blockIdx.x * blockDim.x + threadIdx.x; + int warp_id = global_thread >> 5; // /32 + int lane = threadIdx.x & 31; // %32 + int period_idx = warp_id; + + if (period_idx >= num_periods) return; + + int offset = offsets[period_idx]; + int count = counts[period_idx]; + + // Локальные аккумуляторы каждого lane + double sum_avg = 0.0; + double omin = DBL_MAX, omax = -DBL_MAX; + double cmin = DBL_MAX, cmax = -DBL_MAX; + int local_n = 0; + + // Каждый lane берёт i = lane, lane+32, lane+64... + for (int i = lane; i < count; i += 32) { + int tick = offset + i; + double avg = 0.5 * (low[tick] + high[tick]); + sum_avg += avg; + + double o = open[tick]; + double c = close[tick]; + + omin = fmin(omin, o); + omax = fmax(omax, o); + cmin = fmin(cmin, c); + cmax = fmax(cmax, c); + + local_n += 1; + } + + // Warp-редукция + sum_avg = warp_reduce_sum(sum_avg); + omin = warp_reduce_min(omin); + omax = warp_reduce_max(omax); + cmin = warp_reduce_min(cmin); + cmax = warp_reduce_max(cmax); + int n = warp_reduce_sum_int(local_n); // должно дать count, но так надёжнее + + // lane 0 пишет результат + if (lane == 0) { + GpuPeriodStats s; + s.period = unique_periods[period_idx]; + s.avg = (n > 0) ? (sum_avg / (double)n) : 0.0; + s.open_min = omin; + s.open_max = omax; + s.close_min = cmin; + s.close_max = cmax; + s.count = n; + out_stats[period_idx] = s; + } +} + + // ============================================================================ // Проверка доступности GPU // ============================================================================ @@ -208,7 +298,7 @@ extern "C" int gpu_is_available() { cudaError_t err = cudaGetDeviceCount(&n); if (err != cudaSuccess) return 0; if (n > 0) { - cudaFree(0); // Форсируем инициализацию контекста + // cudaFree(0); // Форсируем инициализацию контекста } return (n > 0) ? 1 : 0; } @@ -352,14 +442,28 @@ extern "C" int gpu_aggregate_periods( GpuPeriodStats* d_out_stats = nullptr; CUDA_CHECK(cudaMalloc(&d_out_stats, num_periods * sizeof(GpuPeriodStats))); - // Используем простой kernel (один поток на период) - // т.к. обычно тиков в периоде немного - int agg_blocks = (num_periods + BLOCK_SIZE - 1) / BLOCK_SIZE; - - aggregate_periods_simple_kernel<<>>( + // int agg_blocks = (num_periods + BLOCK_SIZE - 1) / BLOCK_SIZE; + // aggregate_periods_simple_kernel<<>>( + // d_open, d_high, d_low, d_close, + // d_unique_periods, d_offsets, d_counts, + // num_periods, d_out_stats); + + + // aggregate_periods_kernel<<>>( + // d_open, d_high, d_low, d_close, + // d_unique_periods, d_offsets, d_counts, + // num_periods, d_out_stats); + + int warps_per_block = BLOCK_SIZE / 32; // 8 + int num_blocks1 = (num_periods + warps_per_block - 1) / warps_per_block; + + aggregate_periods_warp_kernel<<>>( d_open, d_high, d_low, d_close, d_unique_periods, d_offsets, d_counts, num_periods, d_out_stats); + + + CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaDeviceSynchronize());