From 1214c1a7186f37577eb6091f975fcd74327dcc59 Mon Sep 17 00:00:00 2001 From: Kartik Samnotra Date: Tue, 21 Apr 2026 06:53:05 +0000 Subject: [PATCH 1/3] Fix download error in census_county_business_patterns --- .../census_county_business_patterns/main.py | 52 +++++++++++-------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/scripts/census_county_business_patterns/main.py b/scripts/census_county_business_patterns/main.py index 5d7a427167..1943d112b6 100644 --- a/scripts/census_county_business_patterns/main.py +++ b/scripts/census_county_business_patterns/main.py @@ -107,29 +107,39 @@ def download_files(): filename = name_template.format(last_two_digits_formatted) url = url_template.format(year, last_two_digits_formatted) logging.info(f"downloading url: {url}") - response = retry_method(url) - zip_content_stream = io.BytesIO(response.content) - with zipfile.ZipFile(zip_content_stream, 'r') as zip_ref: - for member in zip_ref.namelist(): - if not member.endswith('/') and member.lower().endswith( - '.txt'): - extract_path = os.path.join(_LOCAL_OUTPUT_PATH, - os.path.basename(member)) - abs_extract_path = os.path.abspath(extract_path) - abs_target_dir = os.path.abspath(_LOCAL_OUTPUT_PATH) - if not abs_extract_path.startswith(abs_target_dir): - logging.info( - f" WARNING: Path traversal attempt detected for '{member}'. Skipping." - ) - continue # Skip this member to prevent security risk + try: + response = retry_method(url) + zip_content_stream = io.BytesIO(response.content) + with zipfile.ZipFile(zip_content_stream, 'r') as zip_ref: + for member in zip_ref.namelist(): + if not member.endswith('/') and member.lower().endswith( + '.txt'): + extract_path = os.path.join( + _LOCAL_OUTPUT_PATH, + os.path.join(_LOCAL_OUTPUT_PATH, + os.path.basename(member))) + abs_extract_path = os.path.abspath(extract_path) + abs_target_dir = os.path.abspath(_LOCAL_OUTPUT_PATH) + + if not abs_extract_path.startswith(abs_target_dir): + logging.info( + f" WARNING: Path traversal attempt detected for '{member}'. Skipping." + ) + continue # Read the file content from the in-memory zip and write it to disk - with open(extract_path, 'wb') as outfile: - outfile.write(zip_ref.read(member)) - extracted_any_txt = True - else: - logging.info( - f"Skipping non-txt file/folder in zip: '{member}'") + with open(extract_path, 'wb') as outfile: + outfile.write(zip_ref.read(member)) + else: + logging.info( + f" Skipping non-txt file/folder in zip: '{member}'" + ) + except (requests.exceptions.RequestException, + zipfile.BadZipFile) as e: + logging.warning( + f"An error occurred while downloading {url}: {e}. Skipping." + ) + continue def main(argv): From 5479e265d78a835c923ab9d812ddb0b1e8df3434 Mon Sep 17 00:00:00 2001 From: Kartik Samnotra Date: Wed, 6 May 2026 08:22:21 +0000 Subject: [PATCH 2/3] fixed historical handling --- .../census_county_business_patterns/main.py | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/scripts/census_county_business_patterns/main.py b/scripts/census_county_business_patterns/main.py index 1943d112b6..4d92fbdb4b 100644 --- a/scripts/census_county_business_patterns/main.py +++ b/scripts/census_county_business_patterns/main.py @@ -98,6 +98,8 @@ def retry_method(url, headers=None): def download_files(): start_year = FLAGS.data_start_year end_year = FLAGS.data_end_year + # The latest year being attempted is end_year - 2 (e.g., 2026 - 2 = 2024) + latest_year = end_year - 2 for year in range(start_year, end_year - 1): last_two_digits_formatted = f"{year % 100:02d}" @@ -136,10 +138,20 @@ def download_files(): ) except (requests.exceptions.RequestException, zipfile.BadZipFile) as e: - logging.warning( - f"An error occurred while downloading {url}: {e}. Skipping." - ) - continue + # Check if this is the latest year which might not be published yet (404) + is_404 = (isinstance(e, requests.exceptions.HTTPError) and + e.response.status_code == 404) + if year == latest_year and is_404: + logging.warning( + f"Latest year {year} not yet available at {url}. Skipping." + ) + continue + else: + # For historical years or non-404 errors, we want the script to fail + logging.error( + f"Critical failure: Could not download historical data for {year} at {url}." + ) + raise e def main(argv): From c984039a457d80041305cd4f8853c13b38b60f37 Mon Sep 17 00:00:00 2001 From: Kartik Samnotra Date: Wed, 6 May 2026 11:46:51 +0000 Subject: [PATCH 3/3] Updated Readme to include new download logic --- .../census_county_business_patterns/README.md | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/scripts/census_county_business_patterns/README.md b/scripts/census_county_business_patterns/README.md index f9921080bf..326e8fc48d 100644 --- a/scripts/census_county_business_patterns/README.md +++ b/scripts/census_county_business_patterns/README.md @@ -21,10 +21,21 @@ No preprocessing required. fully auto refresh # Script Execution Details -First, run `main.py` to download the data. -flags: data_start_year - this is the default flag which refers to the start year. - data_end_year - this is the default flag which refers to the current year. - output_dir - this is also the default flag which refers to the output directory for processed output from 'main.py' script. +The `main.py` script is used to download and process the source data. + +### Download and Processing Logic +The script iterates through a range of years from `data_start_year` up to `data_end_year - 2`. +- **Historical Data:** For years prior to the latest available data, the script expects the data to be present at the source. If a download fails for these years, it is considered a critical failure and the script will stop. +- **Latest Year Data:** The script attempts to download data for the latest year (calculated as `data_end_year - 2`). If this data is not yet published (returning a 404 error), the script will log a warning and skip it, allowing the import to proceed with available historical data. +- **File Extraction:** Source files are downloaded as ZIP archives, and only relevant `.txt` files are extracted for processing. +- **In-Memory Processing:** Extracted `.txt` files are loaded into memory and processed into CSV format suitable for sharding. + +### Flags +- `data_start_year`: The year to start downloading data from (default is 2016). +- `data_end_year`: The year to process data up to (default is the current year). The script attempts to download data up to `data_end_year - 2`. +- `output_dir`: The directory where processed CSV files will be saved. +- `test`: A boolean flag to run the script in test mode (default is False). + The `shard_input_csv.sh` script performs the following preprocessing steps: 1. Creates directories for shards, debug outputs, and final processed outputs. 2. Splits the large input CSV files into smaller shards of 500,000 rows each.