Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 0 additions & 35 deletions scripts/us_nces/demographics/private_school/manifest.json

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"import_specifications": [
{
"import_name": "NCES_PrivateSchool",
"curator_emails": [
"support@datacommons.org"
],
"provenance_url": "https://nces.ed.gov/ccd/elsi/tableGenerator.aspx",
"provenance_description": "US NCES private school data",
"scripts": [
"../run.sh",
"../process.py --place"
],
"source_files": [
"../gcs_folder/input_files/*/*.csv"
],
"import_inputs": [
{
"template_mcf": "../gcs_folder/output_place/us_nces_demographics_private_place.tmcf",
"cleaned_csv": "../gcs_folder/output_place/us_nces_demographics_private_place.csv"
}
],
"cron_schedule": "30 3 1 3,6,9,12 *"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"import_specifications": [
{
"import_name": "NCES_PrivateSchoolStats",
"curator_emails": [
"support@datacommons.org"
],
"provenance_url": "https://nces.ed.gov/ccd/elsi/tableGenerator.aspx",
"provenance_description": "US NCES private school data",
"scripts": [
"../../run.sh"
],
"source_files": [
"../../gcs_folder/input_files/*/*.csv"
],
"import_inputs": [
{
"template_mcf": "../../gcs_folder/output_files/us_nces_demographics_private_school.tmcf",
"cleaned_csv": "../../gcs_folder/output_files/us_nces_demographics_private_school.csv"
}
],
Comment thread
smarthg-gi marked this conversation as resolved.
"cron_schedule": "30 3 1 3,6,9,12 *",
"resource_limits": {
"cpu": 32,
"memory": 512,
"disk": 300
}
}
]
}
62 changes: 47 additions & 15 deletions scripts/us_nces/demographics/private_school/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
Expand All @@ -22,25 +22,27 @@
"""

import os
import shutil
import sys
from absl import flags
from absl import app
from absl import logging
import warnings
from absl import app, flags, logging

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

MODULE_DIR = os.path.dirname(__file__)
sys.path.insert(1, MODULE_DIR + '/../..')
from common.us_education import USEducation
from config import *

# Define Flags
FLAGS = flags.FLAGS
flags.DEFINE_bool('stats', False, 'Generate Stats data (CSV, MCF, TMCF).')
flags.DEFINE_bool('place', False, 'Generate Place data (CSV, TMCF).')


class NCESPrivateSchool(USEducation):
"""
This Class has requried methods to generate Cleaned CSV,
This Class has required methods to generate Cleaned CSV,
MCF and TMCF Files.
"""
_import_name = SCHOOL_TYPE
Expand All @@ -65,14 +67,23 @@ def set_generate_statvars_flag(self, flag: bool):
self._generate_statvars = flag


if __name__ == '__main__':
def main(argv):
# Flag Validation: Throw error if no flags are used
if not FLAGS.stats and not FLAGS.place:
raise app.UsageError(
"No execution flag provided. You must specify either --stats or --place."
)

try:
logging.set_verbosity(logging.INFO)
logging.info("Main Method Starts For Private School District ")
logging.info("Main Method Starts For Private School District")

# Path Setup
gcs_output_dir_local = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "gcs_folder")
input_path_base = os.path.join(gcs_output_dir_local, "input_files")
os.makedirs(input_path_base, exist_ok=True)

input_files_to_process = []
if os.path.exists(input_path_base):
for year_folder_name in sorted(os.listdir(input_path_base)):
Expand All @@ -86,15 +97,18 @@ def set_generate_statvars_flag(self, flag: bool):

if not input_files_to_process:
logging.warning(
f"No CSV files found in {input_path_base} or its year subfolders. Please ensure download_input_files.py has been run and placed files correctly."
f"No CSV files found in {input_path_base}. Ensure download_input_files.py was run."
)

# Output Directories
output_file_path = os.path.join(gcs_output_dir_local, "output_files")
os.makedirs(output_file_path, exist_ok=True)

output_file_path_place = os.path.join(gcs_output_dir_local,
"output_place")
os.makedirs(output_file_path_place, exist_ok=True)

# File Paths
cleaned_csv_path = os.path.join(output_file_path, CSV_FILE_NAME)
mcf_path = os.path.join(output_file_path, MCF_FILE_NAME)
tmcf_path = os.path.join(output_file_path, TMCF_FILE_NAME)
Expand All @@ -103,13 +117,31 @@ def set_generate_statvars_flag(self, flag: bool):
CSV_DUPLICATE_NAME)
tmcf_path_place = os.path.join(output_file_path_place, TMCF_FILE_PLACE)

# Initialize Loader
loader = NCESPrivateSchool(input_files_to_process, cleaned_csv_path,
mcf_path, tmcf_path, cleaned_csv_place,
duplicate_csv_place, tmcf_path_place)

loader.generate_csv()
loader.generate_mcf()
loader.generate_tmcf()
logging.info("Main Method Completed For Private School District ")
# Conditional Execution based on flags
if FLAGS.stats:
logging.info("Triggering Stats Import...")
loader.set_generate_statvars_flag(True)
loader.generate_csv()
loader.generate_mcf()
loader.generate_tmcf()

if FLAGS.place:
logging.info("Triggering Place Import...")
# Disable statvars so processing focuses on entity/place data
loader.set_generate_statvars_flag(False)
loader.generate_csv()
loader.generate_tmcf()

logging.info("Main Method Completed For Private School District")

except Exception as e:
logging.fatal(f"Error While Running Private School Process: {e} ")
logging.fatal(f"Error While Running Private School Process: {e}", exc_info=True)


if __name__ == '__main__':
app.run(main)
16 changes: 14 additions & 2 deletions scripts/us_nces/demographics/private_school/run.sh
Original file line number Diff line number Diff line change
@@ -1,2 +1,14 @@
mkdir -p gcs_folder/input_files
gcloud storage cp --recursive gs://unresolved_mcf/us_nces/demographics/private_school/semi_automation_input_files/* gcs_folder/input_files/
#!/bin/bash
#
# Downloads the NCES private school input files from GCS into
# gcs_folder/input_files (next to this script) and then runs
# process.py with the --stats flag.

# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline, so process.py never runs after a failed
# download or a failed directory change.
set -euo pipefail

# 1. Get the absolute path to the directory containing this script
#    (the 'private_school' directory), independent of the caller's cwd.
BASE_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)

# 2. Create the actual data storage folder in the base directory
mkdir -p "$BASE_DIR/gcs_folder/input_files"

# 3. Download files. The wildcard is quoted so that gcloud, not the local
#    shell, expands it against the bucket.
gcloud storage cp --recursive "gs://unresolved_mcf/us_nces/demographics/private_school/semi_automation_input_files/*" "$BASE_DIR/gcs_folder/input_files/"

# 4. Run the process from the base directory so process.py is found
cd "$BASE_DIR"
python process.py --stats
Loading