#include <cuda_runtime.h>
#include <cstdint>
#include <cfloat>
#include <cstdio>
#include <ctime>

// Host-side timer: current CLOCK_MONOTONIC reading converted to milliseconds.
static double get_time_ms() {
    timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    // Whole seconds and the nanosecond remainder are converted separately,
    // then summed into a single millisecond value.
    const double ms_from_seconds = now.tv_sec * 1000.0;
    const double ms_from_nanos = now.tv_nsec / 1000000.0;
    return ms_from_seconds + ms_from_nanos;
}

// Data structures (their layout must match the C++ code).
//
// One input record: six consecutive doubles, in the order
// timestamp, open, high, low, close, volume.
struct GpuRecord {
    double timestamp, open, high;
    double low, close, volume;
};

// Per-day aggregation result produced by aggregate_kernel.
// The layout must stay in sync with the C++ code that consumes it.
struct GpuDayStats {
    long long day;                 // day index copied from the day_indices input
    double avg;                    // mean of (low + high) / 2 over the day's records
    double open_min, open_max;     // extrema of the `open` field
    double close_min, close_max;   // extrema of the `close` field
    long long count;               // number of records in the day
};

// Reports whether a usable CUDA device is present.
//
// Returns 1 when at least one device is enumerated and the CUDA context could
// be initialised, 0 otherwise. As a side effect the context is created eagerly
// so that the first real GPU call does not pay the initialisation cost.
extern "C" int gpu_is_available() {
    int device_count = 0;
    if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
        // Reset the runtime's last-error state so that a later
        // cudaGetLastError() check does not report this probe failure.
        cudaGetLastError();
        return 0;
    }
    if (device_count <= 0) {
        return 0;
    }

    // cudaFree(0) forces context initialisation; if that fails the device
    // cannot be used even though it was enumerated.
    if (cudaFree(0) != cudaSuccess) {
        cudaGetLastError();
        return 0;
    }
    return 1;
}

// Aggregation kernel: each thread processes exactly one day.
//
// Launch layout: 1-D grid of 1-D blocks with at least num_days threads in
// total. The thread with global index d handles day d; surplus threads exit
// immediately. No shared memory is used.
//
// Parameters:
//   records      - array of num_records input records
//   num_records  - length of `records`; used to keep every read in bounds
//   day_offsets  - index in `records` of the first record of each day
//   day_counts   - number of records belonging to each day
//   day_indices  - day identifier copied verbatim into the output
//   num_days     - length of day_offsets / day_counts / day_indices / out_stats
//   out_stats    - receives one GpuDayStats per day
//
// A day whose [offset, offset + count) range is not fully inside `records` is
// clamped to the part that is; `count` in the output reflects the number of
// records actually aggregated. For a day with no records, avg is 0.0 and the
// min/max fields keep their initial sentinels (DBL_MAX / -DBL_MAX).
__global__ void aggregate_kernel(
    const GpuRecord* records,
    int num_records,
    const int* day_offsets,
    const int* day_counts,
    const long long* day_indices,
    int num_days,
    GpuDayStats* out_stats)
{
    // Global thread index == day index.
    int d = blockIdx.x * blockDim.x + threadIdx.x;

    if (d >= num_days) return;

    int offset = day_offsets[d];
    int count = day_counts[d];

    // Never read outside `records`, even if the host-side offsets/counts are
    // inconsistent: an out-of-bounds access would fault the whole context.
    if (offset < 0 || offset >= num_records || count < 0) {
        count = 0;
    } else if (count > num_records - offset) {
        count = num_records - offset;
    }

    GpuDayStats stats;
    stats.day = day_indices[d];
    stats.open_min = DBL_MAX;
    stats.open_max = -DBL_MAX;
    stats.close_min = DBL_MAX;
    stats.close_max = -DBL_MAX;
    stats.count = count;

    double avg_sum = 0.0;

    for (int i = 0; i < count; i++) {
        const GpuRecord& r = records[offset + i];

        // Accumulate the per-record midpoint (low + high) / 2.
        avg_sum += (r.low + r.high) / 2.0;

        // min/max of open
        if (r.open < stats.open_min) stats.open_min = r.open;
        if (r.open > stats.open_max) stats.open_max = r.open;

        // min/max of close
        if (r.close < stats.close_min) stats.close_min = r.close;
        if (r.close > stats.close_max) stats.close_max = r.close;
    }

    // Guard the empty day: 0.0 / 0.0 would store NaN.
    stats.avg = (count > 0) ? avg_sum / static_cast<double>(count) : 0.0;
    out_stats[d] = stats;
}

namespace {

// Owns the timing events and device buffers used by gpu_aggregate_days so that
// every exit path, including early error returns, releases them.
struct AggregateScratch {
    enum { kNumEvents = 10 };

    cudaEvent_t events[kNumEvents];
    int events_created;

    GpuRecord* d_records;
    int* d_day_offsets;
    int* d_day_counts;
    long long* d_day_indices;
    GpuDayStats* d_out_stats;

    AggregateScratch()
        : events_created(0),
          d_records(nullptr),
          d_day_offsets(nullptr),
          d_day_counts(nullptr),
          d_day_indices(nullptr),
          d_out_stats(nullptr) {}

    ~AggregateScratch() {
        free_buffers();
        destroy_events();
    }

    // Creates all timing events; returns false as soon as one creation fails.
    bool create_events() {
        while (events_created < kNumEvents) {
            if (cudaEventCreate(&events[events_created]) != cudaSuccess) return false;
            ++events_created;
        }
        return true;
    }

    // Destroys every event that was successfully created. Safe to call twice.
    void destroy_events() {
        while (events_created > 0) {
            cudaEventDestroy(events[--events_created]);
        }
    }

    // Frees all device buffers and nulls the pointers, so calling this again
    // (e.g. from the destructor) is a no-op.
    void free_buffers() {
        cudaFree(d_records);      d_records = nullptr;
        cudaFree(d_day_offsets);  d_day_offsets = nullptr;
        cudaFree(d_day_counts);   d_day_counts = nullptr;
        cudaFree(d_day_indices);  d_day_indices = nullptr;
        cudaFree(d_out_stats);    d_out_stats = nullptr;
    }

private:
    AggregateScratch(const AggregateScratch&);
    AggregateScratch& operator=(const AggregateScratch&);
};

// Milliseconds between two recorded events, or 0 when the measurement is not
// available. Timing is best-effort: a failure here is cleared so it cannot be
// mistaken for a kernel-launch error by a later cudaGetLastError() call.
float elapsed_ms(cudaEvent_t start, cudaEvent_t stop) {
    float ms = 0.0f;
    if (cudaEventElapsedTime(&ms, start, stop) != cudaSuccess) {
        cudaGetLastError();
        ms = 0.0f;
    }
    return ms;
}

}  // namespace

// Aggregation entry point called from C++.
//
// Copies the records and the per-day index arrays to the device, runs
// aggregate_kernel (one thread per day), copies the per-day statistics back
// into h_out_stats and prints a timing breakdown of each stage to stdout.
//
// Parameters:
//   h_records      - host array of num_records records
//   num_records    - number of records
//   h_day_offsets  - host array, first record index of each day
//   h_day_counts   - host array, number of records of each day
//   h_day_indices  - host array, day identifier of each day
//   num_days       - number of days (length of the three arrays above)
//   h_out_stats    - host array of num_days elements receiving the results
//
// Returns 0 on success (including num_days == 0, where nothing is done), or:
//   -1 .. -5    cudaMalloc failed (records, offsets, counts, indices, stats)
//   -7          kernel launch failed
//   -8          kernel execution failed
//   -10 .. -13  host-to-device copy failed (records, offsets, counts, indices)
//   -14         device-to-host copy of the results failed
//   -20         invalid arguments (negative size or null pointer)
//   -21         CUDA timing events could not be created
extern "C" int gpu_aggregate_days(
    const GpuRecord* h_records,
    int num_records,
    const int* h_day_offsets,
    const int* h_day_counts,
    const long long* h_day_indices,
    int num_days,
    GpuDayStats* h_out_stats)
{
    // Reject sizes that would wrap around when converted to byte counts.
    if (num_records < 0 || num_days < 0) return -20;
    // Nothing to aggregate; a zero-block launch would be an invalid configuration.
    if (num_days == 0) return 0;
    if (h_day_offsets == nullptr || h_day_counts == nullptr ||
        h_day_indices == nullptr || h_out_stats == nullptr) return -20;
    if (num_records > 0 && h_records == nullptr) return -20;

    const double cpu_total_start = get_time_ms();

    const size_t records_bytes = static_cast<size_t>(num_records) * sizeof(GpuRecord);
    const size_t offsets_bytes = static_cast<size_t>(num_days) * sizeof(int);
    const size_t counts_bytes  = static_cast<size_t>(num_days) * sizeof(int);
    const size_t indices_bytes = static_cast<size_t>(num_days) * sizeof(long long);
    const size_t stats_bytes   = static_cast<size_t>(num_days) * sizeof(GpuDayStats);

    // === CUDA events used to time each stage ===
    AggregateScratch res;
    if (!res.create_events()) return -21;

    const cudaEvent_t start_malloc    = res.events[0];
    const cudaEvent_t stop_malloc     = res.events[1];
    const cudaEvent_t start_transfer  = res.events[2];
    const cudaEvent_t stop_transfer   = res.events[3];
    const cudaEvent_t start_kernel    = res.events[4];
    const cudaEvent_t stop_kernel     = res.events[5];
    const cudaEvent_t start_copy_back = res.events[6];
    const cudaEvent_t stop_copy_back  = res.events[7];
    const cudaEvent_t start_free      = res.events[8];
    const cudaEvent_t stop_free       = res.events[9];

    cudaError_t err;

    // === cudaMalloc stage ===
    cudaEventRecord(start_malloc);

    err = cudaMalloc(&res.d_records, records_bytes);
    if (err != cudaSuccess) return -1;

    err = cudaMalloc(&res.d_day_offsets, offsets_bytes);
    if (err != cudaSuccess) return -2;

    err = cudaMalloc(&res.d_day_counts, counts_bytes);
    if (err != cudaSuccess) return -3;

    err = cudaMalloc(&res.d_day_indices, indices_bytes);
    if (err != cudaSuccess) return -4;

    err = cudaMalloc(&res.d_out_stats, stats_bytes);
    if (err != cudaSuccess) return -5;

    cudaEventRecord(stop_malloc);
    cudaEventSynchronize(stop_malloc);
    const float time_malloc_ms = elapsed_ms(start_malloc, stop_malloc);

    // === memcpy H->D stage ===
    cudaEventRecord(start_transfer);

    err = cudaMemcpy(res.d_records, h_records, records_bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) return -10;

    err = cudaMemcpy(res.d_day_offsets, h_day_offsets, offsets_bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) return -11;

    err = cudaMemcpy(res.d_day_counts, h_day_counts, counts_bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) return -12;

    err = cudaMemcpy(res.d_day_indices, h_day_indices, indices_bytes, cudaMemcpyHostToDevice);
    if (err != cudaSuccess) return -13;

    cudaEventRecord(stop_transfer);
    cudaEventSynchronize(stop_transfer);
    const float time_transfer_ms = elapsed_ms(start_transfer, stop_transfer);

    // === kernel stage ===
    const unsigned int THREADS_PER_BLOCK = 256;
    const unsigned int num_blocks =
        (static_cast<unsigned int>(num_days) + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

    cudaEventRecord(start_kernel);

    aggregate_kernel<<<num_blocks, THREADS_PER_BLOCK>>>(
        res.d_records, num_records,
        res.d_day_offsets, res.d_day_counts, res.d_day_indices,
        num_days, res.d_out_stats
    );

    // Launch-configuration errors are reported here ...
    err = cudaGetLastError();
    if (err != cudaSuccess) return -7;

    // ... while faults raised during execution surface at the next sync.
    cudaEventRecord(stop_kernel);
    err = cudaEventSynchronize(stop_kernel);
    if (err != cudaSuccess) return -8;

    const float time_kernel_ms = elapsed_ms(start_kernel, stop_kernel);

    // === memcpy D->H stage ===
    cudaEventRecord(start_copy_back);
    err = cudaMemcpy(h_out_stats, res.d_out_stats, stats_bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) return -14;
    cudaEventRecord(stop_copy_back);
    cudaEventSynchronize(stop_copy_back);
    const float time_copy_back_ms = elapsed_ms(start_copy_back, stop_copy_back);

    // === cudaFree stage ===
    cudaEventRecord(start_free);
    res.free_buffers();
    cudaEventRecord(stop_free);
    cudaEventSynchronize(stop_free);
    const float time_free_ms = elapsed_ms(start_free, stop_free);

    // Release the events before taking the final wall-clock reading so the
    // total covers the whole call, as before.
    res.destroy_events();
    const double cpu_total_ms = get_time_ms() - cpu_total_start;

    // Detailed timing report. Note: the "GPU TOTAL" line is the host
    // wall-clock time of the entire call, not the sum of the stages above.
    printf("  GPU Timings (%d records, %d days):\n", num_records, num_days);
    printf("    cudaMalloc:           %7.3f ms\n", time_malloc_ms);
    printf("    memcpy H->D:          %7.3f ms\n", time_transfer_ms);
    printf("    kernel execution:     %7.3f ms\n", time_kernel_ms);
    printf("    memcpy D->H:          %7.3f ms\n", time_copy_back_ms);
    printf("    cudaFree:             %7.3f ms\n", time_free_ms);
    printf("    GPU TOTAL:            %7.3f ms\n", cpu_total_ms);
    fflush(stdout);

    return 0;
}