From 49b5476e2d373bb3cf547d413877747a7dae4cec Mon Sep 17 00:00:00 2001
From: Dylan McConnell
Date: Mon, 1 Dec 2025 09:19:13 +1100
Subject: [PATCH] Removed accidentally added helper script

---
 unpack_helper.py | 220 -----------------------------------------------
 1 file changed, 220 deletions(-)
 delete mode 100644 unpack_helper.py

diff --git a/unpack_helper.py b/unpack_helper.py
deleted file mode 100644
index c5c209c..0000000
--- a/unpack_helper.py
+++ /dev/null
@@ -1,220 +0,0 @@
-import datetime
-import tempfile
-import timeit
-import zipfile
-from pathlib import Path
-
-import duckdb
-
-from isp_trace_parser import (
-    demand_traces,
-    get_data,
-    optimise_parquet,
-    solar_traces,
-    wind_traces,
-)
-
-
-def list_files_in_zips(directory: str | Path) -> list[str]:
-    """Returns a list of all filenames contained within zip files in a directory.
-
-    Args:
-        directory: Path to directory containing zip files.
-
-    Returns:
-        List of filenames from all zip files.
-    """
-    directory = Path(directory)
-    all_files = []
-
-    for zip_path in directory.glob("*.zip"):
-        with zipfile.ZipFile(zip_path, "r") as zf:
-            all_files.extend(f for f in zf.namelist() if not f.endswith("/"))
-
-    return all_files
-
-
-def process_resource_zips():
-    a = datetime.datetime.now()
-
-    root = Path("/home/dylan/Data/openISP/archive/openISP-server/isp_2024/")
-    solar_dir = root / "solar"
-    wind_dir = root / "wind"
-
-    tmp_path = Path("/home/dylan/Data/openISP/archive/openISP-server/tmp")
-
-    # Unpack all zips into tmp directory
-    for zip_path in solar_dir.glob("*.zip"):
-        print(zip_path)
-        with tempfile.TemporaryDirectory() as tmpdir:
-            with zipfile.ZipFile(zip_path, "r") as zf:
-                zf.extractall(tmpdir)
-
-            for file_type in ["zone", "project"]:
-                filters = solar_traces.SolarMetadataFilter(file_type=[file_type])
-                solar_traces.parse_solar_traces(
-                    input_directory=tmpdir,
-                    parsed_directory=tmp_path / file_type,
-                    filters=filters,
-                )
-
-    for zip_path in wind_dir.glob("*.zip"):
-        print(zip_path)
-        with tempfile.TemporaryDirectory() as tmpdir:
-            with zipfile.ZipFile(zip_path, "r") as zf:
-                zf.extractall(tmpdir)
-
-            for file_type in ["zone", "project"]:
-                filters = wind_traces.WindMetadataFilter(file_type=[file_type])
-                wind_traces.parse_wind_traces(
-                    input_directory=tmpdir,
-                    parsed_directory=tmp_path / file_type,
-                    filters=filters,
-                )
-
-    b = datetime.datetime.now()
-
-    print(a)
-    print(b)
-    print(b - a)
-
-
-def optimise(file_type="project"):
-    a = datetime.datetime.now()
-
-    root = Path("/home/dylan/Data/openISP/archive/openISP-server/")
-    tmp_path = Path("/home/dylan/Data/openISP/archive/openISP-server/tmp")
-
-    try:
-        optimise_parquet.partition_traces_by_columns(
-            input_directory=tmp_path / file_type,
-            output_directory=root / "processed" / file_type,
-            partition_cols=["reference_year"],
-        )
-    except Exception as E:
-        print(E)
-
-    b = datetime.datetime.now()
-    print(a)
-    print(b)
-    print(b - a)
-
-
-# processed = full set
-# processed2 = by partition, sorted
-# processed3 = by partition, no sort
-# raw = ~2k files
-
-
-def test_get():
-    root = Path("/home/dylan/Data/openISP/trace_archive/")
-    n = 10
-    zones = ["N0", "N11", "N1", "N10", "N2", "N3", "N4", "N5", "N6", "N7", "N8"]
-    # zones = ["Q1"]
-    directories = [
-        root / subdir / "zone"
-        for subdir in ["processed", "processed2", "processed3", "tmp"]
-    ]
-    for path in directories:
-        timer = timeit.Timer(
-            lambda: get_data.get_zone_single_reference_year(
-                start_year=2023,
-                end_year=2024,
-                reference_year=2018,
-                zone=zones,
-                resource_type="SAT",
-                directory=path,
-            )
-        )
-        runs = timer.repeat(repeat=n, number=n)
-        print(path, sum(runs) / n)
-
-
-def process_demand_zips():
-    a = datetime.datetime.now()
-
-    root = Path("/home/dylan/Data/openISP/archive/openISP-server/isp_2024/")
-    demand_dir = root / "demand"
-
-    tmp_path = Path("/home/dylan/Data/openISP/archive/openISP-server/tmp")
-
-    # Unpack all zips into tmp directory
-    for zip_path in demand_dir.glob("*.zip"):
-        print(zip_path)
-        with tempfile.TemporaryDirectory() as tmpdir:
-            with zipfile.ZipFile(zip_path, "r") as zf:
-                zf.extractall(tmpdir)
-            demand_traces.parse_demand_traces(
-                input_directory=tmpdir,
-                parsed_directory=tmp_path / "demand",
-                use_concurrency=True,
-            )
-
-    b = datetime.datetime.now()
-
-    print(a)
-    print(b)
-    print(b - a)
-
-
-def optimise_demand():
-    a = datetime.datetime.now()
-    root = Path("/home/dylan/Data/openISP/archive/openISP-server/")
-    tmp_path = Path("/home/dylan/Data/openISP/archive/openISP-server/tmp")
-    file_type = "demand"
-    try:
-        optimise_parquet.partition_traces_by_columns(
-            input_directory=tmp_path / file_type,
-            output_directory=root / "processed" / file_type,
-            partition_cols=["scenario", "reference_year"],
-        )
-    except Exception as E:
-        print(E)
-
-    b = datetime.datetime.now()
-    print(a)
-    print(b)
-    print(b - a)
-
-
-def resource_splitter(file_type="project", split=275):
-    root_dir = Path("/home/dylan/Data/openISP/archive/openISP-server")
-    original_dir = root_dir / "processed" / file_type
-    split_dir = root_dir / "split" / file_type
-
-    # assumes one level of partitioning
-    for reference_year_dir in sorted(original_dir.iterdir()):
-        input_file = reference_year_dir / "data_0.parquet"
-        output_dir = split_dir / reference_year_dir.stem
-
-        with duckdb.connect() as con:
-            # Run the COPY statement
-            con.execute(f"""
-                COPY
-                (SELECT * FROM read_parquet('{input_file}') ORDER BY DATETIME)
-                TO '{output_dir}' (FORMAT PARQUET, FILE_SIZE_BYTES '{split}MB') """)
-        print(reference_year_dir.stem)
-
-
-def demand_splitter():
-    file_type = "demand"
-    root_dir = Path("/home/dylan/Data/openISP/archive/openISP-server")
-    original_dir = root_dir / "processed" / file_type
-    split_dir = root_dir / "split" / file_type
-
-    # assumes two levels of partitioning
-    for scenario_dir in sorted(original_dir.iterdir()):
-        for reference_year_dir in scenario_dir.iterdir():
-            input_file = reference_year_dir / "data_0.parquet"
-
-            (split_dir / scenario_dir.stem).mkdir(exist_ok=True)
-
-            output_dir = split_dir / scenario_dir.stem / reference_year_dir.stem
-
-            with duckdb.connect() as con:
-                # Run the COPY statement
-                con.execute(f"""
-                    COPY
-                    (SELECT * FROM read_parquet('{input_file}') ORDER BY DATETIME)
-                    TO '{output_dir}' (FORMAT PARQUET, FILE_SIZE_BYTES '275MB') """)
-            print(scenario_dir.stem, reference_year_dir.stem)