diff --git a/scripts/us_nces/demographics/private_school/manifest.json b/scripts/us_nces/demographics/private_school/manifest.json deleted file mode 100644 index 195b29fb2e..0000000000 --- a/scripts/us_nces/demographics/private_school/manifest.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "import_specifications": [ - { - "import_name": "NCES_PrivateSchool", - "curator_emails": [ - "support@datacommons.org" - ], - "provenance_url": "https://nces.ed.gov/ccd/elsi/tableGenerator.aspx", - "provenance_description": "US nces school data for private", - "scripts": [ - "run.sh", - "process.py" - ], - "source_files": [ - "gcs_folder/input_files/*/*.csv" - ], - "import_inputs": [ - { - "template_mcf": "gcs_folder/output_files/us_nces_demographics_private_school.tmcf", - "cleaned_csv": "gcs_folder/output_files/us_nces_demographics_private_school.csv" - }, - { - "template_mcf": "gcs_folder/output_place/us_nces_demographics_private_place.tmcf", - "cleaned_csv": "gcs_folder/output_place/us_nces_demographics_private_place.csv" - } - ], - "cron_schedule": "30 3 1 3,6,9,12 *", - "resource_limits": { - "cpu": 32, - "memory": 512, - "disk": 300 - } - } - ] -} diff --git a/scripts/us_nces/demographics/private_school/private_school_place/manifest.json b/scripts/us_nces/demographics/private_school/private_school_place/manifest.json new file mode 100644 index 0000000000..ab23fb30df --- /dev/null +++ b/scripts/us_nces/demographics/private_school/private_school_place/manifest.json @@ -0,0 +1,26 @@ +{ + "import_specifications": [ + { + "import_name": "NCES_PrivateSchool", + "curator_emails": [ + "support@datacommons.org" + ], + "provenance_url": "https://nces.ed.gov/ccd/elsi/tableGenerator.aspx", + "provenance_description": "US nces school data for private", + "scripts": [ + "../run.sh", + "../process.py --place" + ], + "source_files": [ + "../gcs_folder/input_files/*/*.csv" + ], + "import_inputs": [ + { + "template_mcf": "../gcs_folder/output_place/us_nces_demographics_private_place.tmcf", + 
"cleaned_csv": "../gcs_folder/output_place/us_nces_demographics_private_place.csv" + } + ], + "cron_schedule": "30 3 1 3,6,9,12 *" + } + ] +} diff --git a/scripts/us_nces/demographics/private_school/private_school_stats/NCES_PrivateSchoolStats/manifest.json b/scripts/us_nces/demographics/private_school/private_school_stats/NCES_PrivateSchoolStats/manifest.json new file mode 100644 index 0000000000..5fb2dc29b6 --- /dev/null +++ b/scripts/us_nces/demographics/private_school/private_school_stats/NCES_PrivateSchoolStats/manifest.json @@ -0,0 +1,30 @@ +{ + "import_specifications": [ + { + "import_name": "NCES_PrivateSchoolStats", + "curator_emails": [ + "" + ], + "provenance_url": "https://nces.ed.gov/ccd/elsi/tableGenerator.aspx", + "provenance_description": "US nces school data for private", + "scripts": [ + "../../run.sh" + ], + "source_files": [ + "../../gcs_folder/input_files/*/*.csv" + ], + "import_inputs": [ + { + "template_mcf": "../../gcs_folder/output_files/us_nces_demographics_private_school.tmcf", + "cleaned_csv": "../../gcs_folder/output_files/us_nces_demographics_private_school.csv" + } + ], + "cron_schedule": "30 3 1 3,6,9,12 *", + "resource_limits": { + "cpu": 32, + "memory": 512, + "disk": 300 + } + } + ] +} diff --git a/scripts/us_nces/demographics/private_school/process.py b/scripts/us_nces/demographics/private_school/process.py index 5891e4e653..bec382656c 100644 --- a/scripts/us_nces/demographics/private_school/process.py +++ b/scripts/us_nces/demographics/private_school/process.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -22,25 +22,27 @@ """ import os -import shutil import sys -from absl import flags -from absl import app -from absl import logging import warnings +from absl import app, flags, logging warnings.simplefilter(action='ignore', category=FutureWarning) warnings.simplefilter(action='ignore', category=DeprecationWarning) -warnings.simplefilter(action='ignore', category=FutureWarning) + MODULE_DIR = os.path.dirname(__file__) sys.path.insert(1, MODULE_DIR + '/../..') from common.us_education import USEducation from config import * +# Define Flags +FLAGS = flags.FLAGS +flags.DEFINE_bool('stats', False, 'Generate Stats data (CSV, MCF, TMCF).') +flags.DEFINE_bool('place', False, 'Generate Place data (CSV, TMCF).') + class NCESPrivateSchool(USEducation): """ - This Class has requried methods to generate Cleaned CSV, + This Class has required methods to generate Cleaned CSV, MCF and TMCF Files. """ _import_name = SCHOOL_TYPE @@ -65,14 +67,23 @@ def set_generate_statvars_flag(self, flag: bool): self._generate_statvars = flag -if __name__ == '__main__': +def main(argv): + # Flag Validation: Throw error if no flags are used + if not FLAGS.stats and not FLAGS.place: + raise app.UsageError( + "No execution flag provided. You must specify either --stats or --place." 
+ ) + try: logging.set_verbosity(logging.INFO) - logging.info("Main Method Starts For Private School District ") + logging.info("Main Method Starts For Private School District") + + # Path Setup gcs_output_dir_local = os.path.join( os.path.dirname(os.path.abspath(__file__)), "gcs_folder") input_path_base = os.path.join(gcs_output_dir_local, "input_files") os.makedirs(input_path_base, exist_ok=True) + input_files_to_process = [] if os.path.exists(input_path_base): for year_folder_name in sorted(os.listdir(input_path_base)): @@ -86,8 +97,10 @@ def set_generate_statvars_flag(self, flag: bool): if not input_files_to_process: logging.warning( - f"No CSV files found in {input_path_base} or its year subfolders. Please ensure download_input_files.py has been run and placed files correctly." + f"No CSV files found in {input_path_base}. Ensure download_input_files.py was run." ) + + # Output Directories output_file_path = os.path.join(gcs_output_dir_local, "output_files") os.makedirs(output_file_path, exist_ok=True) @@ -95,6 +108,7 @@ def set_generate_statvars_flag(self, flag: bool): "output_place") os.makedirs(output_file_path_place, exist_ok=True) + # File Paths cleaned_csv_path = os.path.join(output_file_path, CSV_FILE_NAME) mcf_path = os.path.join(output_file_path, MCF_FILE_NAME) tmcf_path = os.path.join(output_file_path, TMCF_FILE_NAME) @@ -103,13 +117,31 @@ def set_generate_statvars_flag(self, flag: bool): CSV_DUPLICATE_NAME) tmcf_path_place = os.path.join(output_file_path_place, TMCF_FILE_PLACE) + # Initialize Loader loader = NCESPrivateSchool(input_files_to_process, cleaned_csv_path, mcf_path, tmcf_path, cleaned_csv_place, duplicate_csv_place, tmcf_path_place) - loader.generate_csv() - loader.generate_mcf() - loader.generate_tmcf() - logging.info("Main Method Completed For Private School District ") + # Conditional Execution based on flags + if FLAGS.stats: + logging.info("Triggering Stats Import...") + loader.set_generate_statvars_flag(True) + loader.generate_csv() 
+ loader.generate_mcf() + loader.generate_tmcf() + + if FLAGS.place: + logging.info("Triggering Place Import...") + # Disable statvars so processing focuses on entity/place data + loader.set_generate_statvars_flag(False) + loader.generate_csv() + loader.generate_tmcf() + + logging.info("Main Method Completed For Private School District") + except Exception as e: - logging.fatal(f"Error While Running Private School Process: {e} ") + logging.fatal(f"Error While Running Private School Process: {e}", exc_info=True) + + +if __name__ == '__main__': + app.run(main) diff --git a/scripts/us_nces/demographics/private_school/run.sh b/scripts/us_nces/demographics/private_school/run.sh index f4dfb017f2..f2a45a91e3 100644 --- a/scripts/us_nces/demographics/private_school/run.sh +++ b/scripts/us_nces/demographics/private_school/run.sh @@ -1,2 +1,14 @@ -mkdir -p gcs_folder/input_files -gcloud storage cp --recursive gs://unresolved_mcf/us_nces/demographics/private_school/semi_automation_input_files/* gcs_folder/input_files/ +#!/bin/bash + +# 1. Get the absolute path to the 'private_school' directory +BASE_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) + +# 2. Create the actual data storage folder in the base directory +mkdir -p "$BASE_DIR/gcs_folder/input_files" + +# 3. Download files +gcloud storage cp --recursive "gs://unresolved_mcf/us_nces/demographics/private_school/semi_automation_input_files/*" "$BASE_DIR/gcs_folder/input_files/" + +# 4. Run the process +cd "$BASE_DIR" +python process.py --stats