33import argparse
44import json
55import os
6+ from importlib import util
67
78import pandas as pd
89
# Detect plotly without importing it.  Probe the top-level package only:
# find_spec() on a dotted name ("plotly.express") imports the parent package
# first and raises ModuleNotFoundError when plotly is not installed, which is
# exactly the situation this flag is supposed to report as False.
plotly_found = util.find_spec("plotly") is not None
11+
912
def _collapse_duplicate_keys(series, key_cols, how):
    """Return *series* with one row per key, aggregating duplicates with *how*."""
    if series.index.is_unique:
        return series
    return series.groupby(level=key_cols, dropna=False).agg(how)


def compare_data_columns(
    files, name_column, data_column, info_cols, drop_column, debug=False
):
    """Build a side-by-side comparison of one metric across several JSON files.

    Rows are aligned by keys derived from ``info_cols`` instead of row order:

    - The key list is the subset of ``info_cols`` present in ALL files; if
      that is empty, the subset present in the first file is used instead.
    - Each file is indexed by those keys and duplicate keys are aggregated
      (mean for the metric, first for the test name).
    - The per-file series are concatenated along ``axis=1`` so the indexes
      align, then the index is turned back into ordinary columns so callers
      can group by them.
    - For every file after the first, a ``Ratio 1 vs N`` column
      (file N / file 1) is added; it is NaN where the baseline is 0.
    - With ``debug`` set, a ``<file_label>_name`` column is added per file.

    Args:
        files: Paths of the JSON result files (records orientation).
        name_column: Column holding the test name (only used when ``debug``).
        data_column: Metric column to compare across files.
        info_cols: Candidate key columns, in the order they should appear.
        drop_column: Rows with a missing value in this column are dropped.
        debug: When true, include the per-file test-name columns.

    Returns:
        ``(concat_df, raw_data_cols)`` where ``concat_df`` is the comparison
        table and ``raw_data_cols`` lists the per-file metric column labels.

    Raises:
        ValueError: If ``files`` is empty, a file cannot be read, or no key
            column from ``info_cols`` can be found.
    """
    print("\n compare_data_column:", data_column)

    if not files:
        raise ValueError("No input files provided.")

    # Read every file exactly once; the frames are reused for key discovery
    # and for building the comparison.
    loaded = []
    for path in files:
        try:
            loaded.append(pd.read_json(path, orient="records"))
        except Exception as err:
            raise ValueError(f"Failed to read {path}") from err

    # 1) choose a canonical key list from info_cols that exists in ALL files
    cols_per_file = [set(df.columns) for df in loaded]
    key_cols = [c for c in info_cols if all(c in cols for cols in cols_per_file)]
    if not key_cols:
        # soft fallback: use any info_cols present in the first file
        key_cols = [c for c in info_cols if c in cols_per_file[0]]
    if not key_cols:
        raise ValueError(
            "No common key columns found from info_cols across the input files."
        )

    numeric_key_candidates = (
        "Input Len",
        "Output Len",
        "TP Size",
        "PP Size",
        "# of max concurrency.",
        "qps",
    )

    frames = []
    raw_data_cols = []
    baseline = None

    # 2) one aligned metric series (plus optional name / ratio) per file
    for position, (file, df) in enumerate(zip(files, loaded), start=1):
        # Keep only rows that actually have the compared metric.
        if drop_column in df.columns:
            df = df.dropna(subset=[drop_column], ignore_index=True)

        # Stabilize numeric key columns so equal keys compare equal across
        # files (harmless if a column is missing).
        for col in numeric_key_candidates:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors="coerce")

        # Only reachable through the soft fallback: make sure every key exists.
        for col in key_cols:
            if col not in df.columns:
                df[col] = pd.NA

        # drop=False keeps the key columns addressable as regular columns too.
        df_idx = df.set_index(key_cols, drop=False)

        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
        metric = _collapse_duplicate_keys(df_idx[data_column], key_cols, "mean")
        metric.name = file_label

        if debug and name_column in df_idx.columns:
            names = _collapse_duplicate_keys(df_idx[name_column], key_cols, "first")
            names.name = f"{file_label}_name"
            frames.append(names)

        frames.append(metric)
        raw_data_cols.append(file_label)

        if baseline is None:
            baseline = metric
        else:
            # Mask zero baselines before dividing so the ratio is NaN, not inf.
            ratio = metric / baseline.where(baseline != 0)
            ratio.name = f"Ratio 1 vs {position}"
            frames.append(ratio)

    # 3) concat on columns with aligned indexes, then rebuild the key columns
    #    from the index itself.  Taking them from the index (rather than from
    #    the first file's rows) keeps keys for rows that only exist in a
    #    later file.
    concat_df = pd.concat(frames, axis=1)
    concat_df.index.names = key_cols
    concat_df = concat_df.reset_index()

    # Ensure key/info columns appear first, in info_cols order.
    front = [c for c in info_cols if c in concat_df.columns]
    rest = [c for c in concat_df.columns if c not in front]
    concat_df = concat_df[front + rest]

    print(raw_data_cols)
    return concat_df, raw_data_cols
46131
@@ -67,6 +152,15 @@ def split_json_by_tp_pp(
67152
68153 df = pd .DataFrame (data )
69154
155+ # Keep only "serving" tests
156+ name_col = next (
157+ (c for c in ["Test name" , "test_name" , "Test Name" ] if c in df .columns ), None
158+ )
159+ if name_col :
160+ df = df [
161+ df [name_col ].astype (str ).str .contains (r"serving" , case = False , na = False )
162+ ].copy ()
163+
70164 # Handle alias column names
71165 rename_map = {
72166 "tp_size" : "TP Size" ,
@@ -181,16 +275,14 @@ def split_json_by_tp_pp(
181275 f"Expected subset: { filtered_info_cols } , "
182276 f"but DataFrame has: { list (output_df .columns )} "
183277 )
184-
185278 output_df_sorted = output_df .sort_values (by = existing_group_cols )
186279 output_groups = output_df_sorted .groupby (existing_group_cols , dropna = False )
187280 for name , group in output_groups :
188281 html = group .to_html ()
189282 text_file .write (html_msgs_for_data_cols [i ])
190283 text_file .write (html )
191284
192- if plot is True :
193- import pandas as pd
285+ if plot and plotly_found :
194286 import plotly .express as px
195287
196288 df = group [raw_data_cols ]
0 commit comments