From 9e81174a6da2fd1fa922cbd8681ba8b12af03c2c Mon Sep 17 00:00:00 2001
From: Arity-T <tish.mainmail@gmail.com>
Date: Wed, 17 Dec 2025 13:47:29 +0000
Subject: [PATCH] =?UTF-8?q?=D0=A2=D0=B5=D1=81=D1=82=D0=BE=D0=B2=D1=8B?=
 =?UTF-8?q?=D0=B5=20=D1=8F=D0=B4=D1=80=D0=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/gpu_plugin.cu | 116 +++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 110 insertions(+), 6 deletions(-)

diff --git a/src/gpu_plugin.cu b/src/gpu_plugin.cu
index 9a55f53..ab7dfb8 100644
--- a/src/gpu_plugin.cu
+++ b/src/gpu_plugin.cu
@@ -199,6 +199,96 @@ __global__ void aggregate_periods_simple_kernel(
     out_stats[period_idx] = stats;
 }
 
+
+
+static __device__ __forceinline__ double warp_reduce_sum(double v) {
+    for (int offset = 16; offset > 0; offset >>= 1)
+        v += __shfl_down_sync(0xffffffff, v, offset);
+    return v;
+}
+static __device__ __forceinline__ double warp_reduce_min(double v) {
+    for (int offset = 16; offset > 0; offset >>= 1)
+        v = fmin(v, __shfl_down_sync(0xffffffff, v, offset));
+    return v;
+}
+static __device__ __forceinline__ double warp_reduce_max(double v) {
+    for (int offset = 16; offset > 0; offset >>= 1)
+        v = fmax(v, __shfl_down_sync(0xffffffff, v, offset));
+    return v;
+}
+static __device__ __forceinline__ int warp_reduce_sum_int(int v) {
+    for (int offset = 16; offset > 0; offset >>= 1)
+        v += __shfl_down_sync(0xffffffff, v, offset);
+    return v;
+}
+
+__global__ void aggregate_periods_warp_kernel(
+    const double* __restrict__ open,
+    const double* __restrict__ high,
+    const double* __restrict__ low,
+    const double* __restrict__ close,
+    const int64_t* __restrict__ unique_periods,
+    const int* __restrict__ offsets,
+    const int* __restrict__ counts,
+    int num_periods,
+    GpuPeriodStats* __restrict__ out_stats)
+{
+    int global_thread = blockIdx.x * blockDim.x + threadIdx.x;
+    int warp_id       = global_thread >> 5;     // /32
+    int lane          = threadIdx.x & 31;       // %32
+    int period_idx    = warp_id;
+
+    if (period_idx >= num_periods) return;
+
+    int offset = offsets[period_idx];
+    int count  = counts[period_idx];
+
+    // Локальные аккумуляторы каждого lane
+    double sum_avg = 0.0;
+    double omin = DBL_MAX, omax = -DBL_MAX;
+    double cmin = DBL_MAX, cmax = -DBL_MAX;
+    int local_n = 0;
+
+    // Каждый lane берёт i = lane, lane+32, lane+64...
+    for (int i = lane; i < count; i += 32) {
+        int tick = offset + i;
+        double avg = 0.5 * (low[tick] + high[tick]);
+        sum_avg += avg;
+
+        double o = open[tick];
+        double c = close[tick];
+
+        omin = fmin(omin, o);
+        omax = fmax(omax, o);
+        cmin = fmin(cmin, c);
+        cmax = fmax(cmax, c);
+
+        local_n += 1;
+    }
+
+    // Warp-редукция
+    sum_avg = warp_reduce_sum(sum_avg);
+    omin    = warp_reduce_min(omin);
+    omax    = warp_reduce_max(omax);
+    cmin    = warp_reduce_min(cmin);
+    cmax    = warp_reduce_max(cmax);
+    int n   = warp_reduce_sum_int(local_n);   // должно дать count, но так надёжнее
+
+    // lane 0 пишет результат
+    if (lane == 0) {
+        GpuPeriodStats s;
+        s.period    = unique_periods[period_idx];
+        s.avg       = (n > 0) ? (sum_avg / (double)n) : 0.0;
+        s.open_min  = omin;
+        s.open_max  = omax;
+        s.close_min = cmin;
+        s.close_max = cmax;
+        s.count     = n;
+        out_stats[period_idx] = s;
+    }
+}
+
+
 // ============================================================================
 // Проверка доступности GPU
 // ============================================================================
@@ -208,7 +298,7 @@ extern "C" int gpu_is_available() {
     cudaError_t err = cudaGetDeviceCount(&n);
     if (err != cudaSuccess) return 0;
     if (n > 0) {
-        cudaFree(0);  // Форсируем инициализацию контекста
+        // cudaFree(0);  // Форсируем инициализацию контекста
     }
     return (n > 0) ? 1 : 0;
 }
@@ -352,14 +442,28 @@ extern "C" int gpu_aggregate_periods(
     GpuPeriodStats* d_out_stats = nullptr;
     CUDA_CHECK(cudaMalloc(&d_out_stats, num_periods * sizeof(GpuPeriodStats)));
     
-    // Используем простой kernel (один поток на период)
-    // т.к. обычно тиков в периоде немного
-    int agg_blocks = (num_periods + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    
-    aggregate_periods_simple_kernel<<<agg_blocks, BLOCK_SIZE>>>(
+    // int agg_blocks = (num_periods + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    // aggregate_periods_simple_kernel<<<agg_blocks, BLOCK_SIZE>>>(
+    //     d_open, d_high, d_low, d_close,
+    //     d_unique_periods, d_offsets, d_counts,
+    //     num_periods, d_out_stats);
+
+
+    // aggregate_periods_kernel<<<num_periods, BLOCK_SIZE>>>(
+    //     d_open, d_high, d_low, d_close,
+    //     d_unique_periods, d_offsets, d_counts,
+    //     num_periods, d_out_stats);
+
+    int warps_per_block = BLOCK_SIZE / 32;      // 8
+    int num_blocks1 = (num_periods + warps_per_block - 1) / warps_per_block;
+
+    aggregate_periods_warp_kernel<<<num_blocks1, BLOCK_SIZE>>>(
         d_open, d_high, d_low, d_close,
         d_unique_periods, d_offsets, d_counts,
         num_periods, d_out_stats);
+
+
+
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());