Files
supercomputers/upsample.py
2025-12-16 13:25:38 +00:00

137 lines
3.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
import argparse
import sys
import time
def parse_row(line: str):
# Timestamp,Open,High,Low,Close,Volume
ts, o, h, l, c, v = line.split(',')
return int(float(ts)), float(o), float(h), float(l), float(c), float(v)
def fmt_row(ts, o, h, l, c, v):
return f"{ts},{o:.2f},{h:.2f},{l:.2f},{c:.2f},{v:.8f}\n"
def count_lines_fast(path: str) -> int:
with open(path, "rb") as f:
return sum(1 for _ in f) - 1 # минус header
def main(inp, out, step, flush_every):
# считаем количество строк для прогресса
total_lines = count_lines_fast(inp)
print(f"Total input rows: {total_lines:,}", file=sys.stderr)
start_time = time.time()
processed = 0
last_report = start_time
with open(inp, "r", buffering=8 * 1024 * 1024) as fin, \
open(out, "w", buffering=8 * 1024 * 1024) as fout:
fin.readline() # пропускаем header
fout.write("Timestamp,Open,High,Low,Close,Volume\n")
first = fin.readline()
if not first:
return
prev = parse_row(first.strip())
out_buf = []
out_rows = 0
for line in fin:
line = line.strip()
if not line:
continue
cur = parse_row(line)
t1, o1, h1, l1, c1, v1 = prev
t2, o2, h2, l2, c2, v2 = cur
dt = t2 - t1
steps = dt // step
if steps > 0:
do = o2 - o1
dh = h2 - h1
dl = l2 - l1
dc = c2 - c1
dv = v2 - v1
inv = 1.0 / steps
for i in range(steps):
a = i * inv
out_buf.append(fmt_row(
t1 + i * step,
o1 + do * a,
h1 + dh * a,
l1 + dl * a,
c1 + dc * a,
v1 + dv * a
))
out_rows += steps
prev = cur
processed += 1
# прогресс
if processed % 100_000 == 0:
now = time.time()
if now - last_report >= 0.5:
pct = processed * 100.0 / total_lines
elapsed = now - start_time
speed = processed / elapsed if elapsed > 0 else 0
eta = (total_lines - processed) / speed if speed > 0 else 0
print(
f"\rprocessed: {processed:,} / {total_lines:,} "
f"({pct:5.1f}%) | "
f"out ~ {out_rows:,} | "
f"{speed:,.0f} rows/s | "
f"ETA {eta/60:5.1f} min",
end="",
file=sys.stderr,
flush=True,
)
last_report = now
# сброс буфера
if out_rows >= flush_every:
fout.write("".join(out_buf))
out_buf.clear()
out_rows = 0
# остатки
if out_buf:
fout.write("".join(out_buf))
# последнюю строку пишем как есть
t, o, h, l, c, v = prev
fout.write(fmt_row(t, o, h, l, c, v))
total_time = time.time() - start_time
print(
f"\nDone in {total_time/60:.1f} min",
file=sys.stderr
)
if __name__ == "__main__":
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--input", required=True)
ap.add_argument("-o", "--output", required=True)
ap.add_argument("-s", "--step", type=int, default=10)
ap.add_argument("--flush-every", type=int, default=200_000)
args = ap.parse_args()
if args.step <= 0:
raise SystemExit("step must be > 0")
main(args.input, args.output, args.step, args.flush_every)