# supercomputers/upsample.py

#!/usr/bin/env python3
import argparse
import sys
import time


def parse_row(line: str):
    """Parse one CSV data row into ``(timestamp, open, high, low, close, volume)``.

    The row must contain exactly six comma-separated fields in the order
    ``Timestamp,Open,High,Low,Close,Volume``; any other field count raises
    ``ValueError`` from the unpacking below.  The timestamp is parsed as a
    float and then truncated to ``int`` (so ``"1500.0"`` becomes ``1500``);
    the remaining five fields are returned as floats.
    """
    stamp, open_, high, low, close, volume = line.split(",")
    # Convert the timestamp first so that, for a malformed row, the same
    # field is reported as the offender as before.
    timestamp = int(float(stamp))
    return (timestamp, *map(float, (open_, high, low, close, volume)))


def fmt_row(ts, o, h, l, c, v):
    """Render one output CSV line, terminated by a newline.

    The timestamp is written as-is, the four price columns with two decimal
    places and the volume with eight decimal places.
    """
    fields = [f"{ts}"]
    fields.extend(f"{price:.2f}" for price in (o, h, l, c))
    fields.append(f"{v:.8f}")
    return ",".join(fields) + "\n"


def count_lines_fast(path: str) -> int:
    """Return the number of lines in ``path`` after the header line.

    The file is read in binary mode to avoid text-decoding overhead and every
    line (blank ones included) is counted.  One line is subtracted for the
    header; the result is clamped at zero so that a completely empty file
    reports 0 rows rather than -1.
    """
    with open(path, "rb") as f:
        line_count = sum(1 for _ in f)
    # Minus the header; never negative, even when the file has no header.
    return max(line_count - 1, 0)


def main(inp, out, step, flush_every):
    """Upsample an OHLCV CSV file to a fixed time step by linear interpolation.

    The input is expected to start with one header line followed by
    ``Timestamp,Open,High,Low,Close,Volume`` rows.  For every pair of
    consecutive rows, ``dt // step`` rows are generated (``dt`` being the
    timestamp difference), spaced ``step`` apart and starting at the earlier
    row's timestamp; every column is linearly interpolated between the two
    rows.  The last input row is written unchanged.  Progress is reported on
    stderr.

    Args:
        inp: Path of the input CSV file.
        out: Path of the output CSV file (opened for writing, so an existing
            file is overwritten).
        step: Spacing of output timestamps, in the units of the Timestamp
            column.  Must be > 0 (it is used as a divisor).
        flush_every: Number of buffered output rows that triggers a write of
            the buffer to ``out``.
    """
    # Count the rows up front so progress can be shown as a percentage.
    total_lines = count_lines_fast(inp)
    print(f"Total input rows: {total_lines:,}", file=sys.stderr)

    start_time = time.time()
    processed = 0
    last_report = start_time

    with open(inp, "r", buffering=8 * 1024 * 1024) as fin, \
         open(out, "w", buffering=8 * 1024 * 1024) as fout:

        fin.readline()  # skip the input header
        fout.write("Timestamp,Open,High,Low,Close,Volume\n")

        # Locate the first non-blank data row.  Blank lines are skipped here
        # just like in the main loop, instead of crashing in parse_row.
        first = ""
        for raw in fin:
            first = raw.strip()
            if first:
                break
        if not first:
            # No data rows at all: the output contains only the header.
            return

        prev = parse_row(first)

        out_buf = []
        out_rows = 0  # total rows generated so far (reported in the progress line)

        for line in fin:
            line = line.strip()
            if not line:
                continue

            cur = parse_row(line)

            t1, o1, h1, l1, c1, v1 = prev
            t2, o2, h2, l2, c2, v2 = cur

            dt = t2 - t1
            steps = dt // step

            # When the gap is shorter than `step` (or timestamps go backwards)
            # `steps` is <= 0: nothing is emitted for `prev` and it is simply
            # replaced by `cur` below.
            if steps > 0:
                do = o2 - o1
                dh = h2 - h1
                dl = l2 - l1
                dc = c2 - c1
                dv = v2 - v1

                for i in range(steps):
                    offset = i * step
                    # Weight by elapsed time rather than by i / steps, so the
                    # value matches the emitted timestamp even when dt is not
                    # an exact multiple of step.  dt >= step > 0 here.
                    a = offset / dt
                    out_buf.append(fmt_row(
                        t1 + offset,
                        o1 + do * a,
                        h1 + dh * a,
                        l1 + dl * a,
                        c1 + dc * a,
                        v1 + dv * a
                    ))

                out_rows += steps

            prev = cur
            processed += 1

            # Progress report: checked every 100k rows, printed at most
            # once every 0.5 s.
            if processed % 100_000 == 0:
                now = time.time()
                if now - last_report >= 0.5:
                    pct = processed * 100.0 / total_lines if total_lines > 0 else 0.0
                    elapsed = now - start_time
                    speed = processed / elapsed if elapsed > 0 else 0
                    eta = (total_lines - processed) / speed if speed > 0 else 0

                    print(
                        f"\rprocessed: {processed:,} / {total_lines:,} "
                        f"({pct:5.1f}%) | "
                        f"out ~ {out_rows:,} | "
                        f"{speed:,.0f} rows/s | "
                        f"ETA {eta/60:5.1f} min",
                        end="",
                        file=sys.stderr,
                        flush=True,
                    )
                    last_report = now

            # Flush the buffer once it holds enough rows.  The decision is
            # based on the buffer size, so the cumulative out_rows counter
            # is never reset.
            if len(out_buf) >= flush_every:
                fout.write("".join(out_buf))
                out_buf.clear()

        # Write whatever is still buffered.
        if out_buf:
            fout.write("".join(out_buf))

        # The last input row is written as-is.
        t, o, h, l, c, v = prev
        fout.write(fmt_row(t, o, h, l, c, v))

    total_time = time.time() - start_time
    print(
        f"\nDone in {total_time/60:.1f} min",
        file=sys.stderr
    )


if __name__ == "__main__":
    # Command-line entry point: parse the options, validate them, then run.
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", required=True)
    parser.add_argument("-o", "--output", required=True)
    parser.add_argument("-s", "--step", type=int, default=10)
    parser.add_argument("--flush-every", type=int, default=200_000)
    options = parser.parse_args()

    # The step is used as a divisor and as the output spacing, so it has to
    # be strictly positive.
    if not options.step > 0:
        raise SystemExit("step must be > 0")

    main(
        inp=options.input,
        out=options.output,
        step=options.step,
        flush_every=options.flush_every,
    )