llmq/scripts/export_wandb.py at dev · IST-DASLab/llmq · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env -S uv run --script
#
# /// script
# requires-python = ">=3.12"
# dependencies = ["wandb", "plotly[express]", "pandas"]
# ///


import argparse
import datetime
import json
from typing import IO, Optional

import wandb


def _log_pie_chart(run: "wandb.Run", key: str, title: str, names: list, values: list, step: int):
    """Build a plotly pie chart from ``names``/``values`` and log it as ``key``."""
    # Imported lazily so that logs without chart records never need plotly.
    import plotly.express as px

    fig = px.pie(
        names=names,
        values=values,
        title=title,
    )
    run.log({key: fig}, step=step)


def log_line(run: "wandb.Run", entry: dict):
    """Forward one parsed log record to a WandB run.

    The record's ``"log"`` field selects the handling:

    * ``"step"`` / ``"eval"``: every remaining field is logged under the
      ``train/`` or ``eval/`` prefix, with the token count replaced by a
      derived ``tokens_per_second`` value.
    * ``"gpu"``: fields are logged under ``gpu/`` after dropping
      ``throttle`` and ``id`` (and ``fan`` when it is 0) and rescaling the
      memory / PCIe counters by ``1 / 1024**2``.
    * ``"cmd"`` / ``"gpu-model"``: stored in ``run.config``.
    * ``"allocator"`` / ``"sol"``: logged as plotly pie charts.
    * ``"abs-maxes"``: each entry is logged as ``abs_maxes/<name>``.
    * ``"dataset"``, ``"option"``, ``"info"``, ``"message"``: ignored.

    Args:
        run: Object providing ``log(data, step=...)`` and a dict-like
            ``config``.
        entry: One record of the log; must contain ``"log"``, ``"step"`` and
            ``"time"``. It is not modified.

    Raises:
        RuntimeError: If the record kind is not one of the kinds above.
        KeyError: If a field required for the record kind is missing.
    """
    # Work on a shallow copy so the caller's record is left untouched and can
    # be inspected or re-processed after this call.
    record = dict(entry)
    kind = record.pop("log")
    step = record.pop("step")
    del record["time"]  # TODO can we associate a datetime with step?

    if kind == "step":
        # duration_ms is in milliseconds; convert to seconds for the rate.
        tps = record.pop("step_tokens") / (record["duration_ms"] / 1000)
        metrics = {f"train/{k}": v for k, v in record.items()}
        metrics["train/tokens_per_second"] = tps
        run.log(metrics, step=step)
    elif kind == "eval":
        tps = record.pop("eval_tokens") / (record["duration_ms"] / 1000)
        metrics = {f"eval/{k}": v for k, v in record.items()}
        metrics["eval/tokens_per_second"] = tps
        run.log(metrics, step=step)
    elif kind == "gpu":
        del record["throttle"]  # can't log this nicely?
        del record["id"]        # not useful?
        if record["fan"] == 0:  # indicates not recorded
            del record["fan"]
        record["dram_free"] /= 1024**2   # MiB
        record["pcie_rx"] /= 1024**2     # MiB/s
        record["pcie_tx"] /= 1024**2     # MiB/s
        run.log({f"gpu/{k}": v for k, v in record.items()}, step=step)
    elif kind == "cmd":
        # TODO figure out if we can actually put this in the _wandb config object
        # where it belongs
        run.config["cmd"] = record["cmd"]
    elif kind == "gpu-model":
        # Rank 0 gets the plain "gpu" key; other ranks are suffixed.
        if record["rank"] == 0:
            run.config["gpu"] = record
        else:
            run.config[f"gpu-{record['rank']}"] = record
    elif kind == "allocator":
        names = [alloc["name"] for alloc in record["stats"]]
        # Scale the "device" amounts by 1 / 1024 / 1024, one decimal place.
        amounts = [round(alloc["device"] / 1024 / 1024, 1) for alloc in record["stats"]]
        _log_pie_chart(run, "allocations", "GPU Allocations", names, amounts, step)
    elif kind in ("dataset", "option", "info", "message"):
        # These record kinds are currently not exported.
        pass
    elif kind == "abs-maxes":
        for stats in record["abs_maxes"]:
            run.log({f"abs_maxes/{stats['name']}": stats["value"]}, step=step)
    elif kind == "sol":
        # Only rank 0 contributes the chart.
        if record["rank"] != 0:
            return
        names = ["Blocks", "LM-Head", "Attention"]
        amounts = [record["blocks"], record["lm_head"], record["attention"]]
        _log_pie_chart(run, "ops", "FLOPs", names, amounts, step)
    else:
        raise RuntimeError(f"Unknown kind {kind}")

def convert_log(file_name: str | IO[str], *, name: Optional[str], project: str, notes: str = "", tags: Optional[list[str]] = None):
    """Upload a JSON training log to WandB as a single run.

    Args:
        file_name: Either an already-open text file containing the JSON log,
            or a path to such a file. A file opened here from a path is
            closed again; a file object passed in is left open for the caller.
        name: Name for the WandB run. When ``None``, the value of the log's
            ``"option"`` record whose ``"name"`` is ``"name"`` is used (the
            last such record wins); if there is none, ``None`` is passed on
            to ``wandb.init``.
        project: WandB project name, passed to ``wandb.init``.
        notes: Notes passed to ``wandb.init``.
        tags: Tags passed to ``wandb.init``.
    """
    # json.load needs a file object; previously a path (as the parameter name
    # and annotation suggest) failed with AttributeError.
    if hasattr(file_name, "read"):
        log_data = json.load(file_name)
    else:
        with open(file_name, encoding="utf-8") as log_file:
            log_data = json.load(log_file)

    if name is None:
        # No break: keep scanning so a later "name" option overrides earlier ones.
        for entry in log_data:
            if entry["log"] == "option" and entry["name"] == "name":
                name = entry["value"]

    with wandb.init(
            project=project,
            name=name,
            notes=notes,
            tags=tags,
    ) as run:
        for entry in log_data:
            log_line(run, entry)


def main():
    """Parse the command line and export the selected log file to WandB.

    Options:
        --log-file: JSON log to export (default ``log.json``).
        --project: WandB project name.
        --name: Name for the run; when omitted, ``convert_log`` falls back to
            the name recorded in the log.
    """
    parser = argparse.ArgumentParser(description="Export a training log to Weights & Biases")
    # Read the log as UTF-8 regardless of the platform's default encoding.
    parser.add_argument("--log-file", type=argparse.FileType("r", encoding="utf-8"), help="Log file", default="log.json")
    parser.add_argument("--project", help="WandB project name")
    parser.add_argument("--name", help="Name for the run", default=None)
    args = parser.parse_args()
    # argparse opened the file for us; make sure it is closed once exported.
    with args.log_file:
        convert_log(args.log_file, project=args.project, name=args.name)


# Run the exporter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()