Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions statvar_imports/tuberculosis_preventive_treatment/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# WHO Tuberculosis: Percentage of household contacts (or all close contacts) who were started on TB preventive treatment out of those eligible

## Overview
This dataset provides the percentage of household contacts (or close contacts) of people diagnosed with a new episode of bacteriologically confirmed pulmonary TB disease who started TB preventive treatment, out of those eligible.

## Data Source

**Source URL:**
https://data.who.int/indicators/i/45274BD/F5556F8

The data comes from the official WHO reporting database and includes comprehensive, country-level health metrics detailing annual Tuberculosis notifications and case classifications.

## How To Download Input Data
To download the data, you'll need to run the provided download script `tb_data_download_who.py`. This script automatically queries the WHO API for the indicator, merges it with the WHO geographical master list to append standard `iso3` country codes, and saves the cleaned `Tuberculosis_preventive_treatment.csv` file inside an "input_files" folder.

type of place: Country.

statvars: Health / Tuberculosis.

years: 2010 to 2022

place_resolution: Manual (country names are matched to standard `iso3` codes by the download script).

release_frequency: P1Y

## Processing Instructions
To process the WHO Tuberculosis data and generate statistical variables, use the following commands from your root `data` directory:

**Download input file**
```bash
python3 statvar_imports/tuberculosis_preventive_treatment/tb_data_download_who.py
```

**For Test Data Run**
```bash
python3 tools/statvar_importer/stat_var_processor.py \
--input_data="statvar_imports/tuberculosis_preventive_treatment/testdata/Tuberculosis_preventive_treatment.csv" \
--pv_map="statvar_imports/tuberculosis_preventive_treatment/tuberculosis_PreventiveTreatment_pvmap.csv" \
--output_path="statvar_imports/tuberculosis_preventive_treatment/output_files/tuberculosis_PreventiveTreatment" \
--config_file="statvar_imports/tuberculosis_preventive_treatment/tuberculosis_PreventiveTreatment_metadata.csv" \
--existing_statvar_mcf="gs://unresolved_mcf/scripts/statvar/stat_vars.mcf"
```

**For Main data run**
```bash
python3 tools/statvar_importer/stat_var_processor.py \
--input_data="statvar_imports/tuberculosis_preventive_treatment/input_files/Tuberculosis_preventive_treatment.csv" \
--pv_map="statvar_imports/tuberculosis_preventive_treatment/tuberculosis_PreventiveTreatment_pvmap.csv" \
--output_path="statvar_imports/tuberculosis_preventive_treatment/output_files/tuberculosis_PreventiveTreatment" \
--config_file="statvar_imports/tuberculosis_preventive_treatment/tuberculosis_PreventiveTreatment_metadata.csv" \
--existing_statvar_mcf="gs://unresolved_mcf/scripts/statvar/stat_vars.mcf"
```

#### Refresh type: Fully Autorefresh
26 changes: 26 additions & 0 deletions statvar_imports/tuberculosis_preventive_treatment/manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"import_specifications": [
{
"import_name": "WHO_TuberculosisPreventiveTreatment",
"curator_emails": [
"support@datacommons.org"
],
"provenance_url": "https://data.who.int/indicators/i/45274BD/F5556F8",
"provenance_description": "Tuberculosis: Percentage of household contacts (or all close contacts) who were started on TB preventive treatment out of those eligible",
"scripts": [
"tb_data_download_who.py",
"../../../tools/statvar_importer/stat_var_processor.py --input_data=input_files/Tuberculosis_preventive_treatment.csv --pv_map=tuberculosis_PreventiveTreatment_pvmap.csv --config_file=tuberculosis_PreventiveTreatment_metadata.csv --output_path=output/tuberculosis_preventive_output --existing_statvar_mcf=gs://unresolved_mcf/scripts/statvar/stat_vars.mcf"
],
"import_inputs": [
{
"template_mcf": "output/tuberculosis_preventive_output.tmcf",
"cleaned_csv": "output/tuberculosis_preventive_output.csv"
}
],
"source_files": [
"input_files/Tuberculosis_preventive_treatment.csv"
],
"cron_schedule": "0 10 10,21 * *"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import os
import requests
import io
import pandas as pd
import logging

# Configure the root logger: INFO level with a timestamp - level - message format,
# used by all logging.* calls below.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def download_tb_percentage_data():
    """Download WHO TB preventive-treatment data, attach iso3 codes, and save a CSV.

    Steps:
      1. Query the WHO xmart API for indicator 45274BDF5556F8 (percentage of
         eligible household/close contacts started on TB preventive treatment).
      2. Download the WHO TB notifications master list to obtain the
         country-name -> iso3 mapping.
      3. Left-merge the two tables on the whitespace-stripped country name.
      4. Write the merged table to
         statvar_imports/tuberculosis_preventive_treatment/input_files/
         Tuberculosis_preventive_treatment.csv.

    Returns:
        None. On any download/parse failure the error is logged and the
        function returns early without writing a file.
    """
    # 1. Get the clean data from the API using the indicator ID.
    api_url = "https://xmart-api-public.who.int/DATA_/RELAY_TB_DATA"
    params = {
        "$filter": "IND_ID eq '45274BDF5556F8'",
        "$select": "IND_ID,INDICATOR_NAME,YEAR,COUNTRY,DISAGGR_1,VALUE",
        "$format": "csv"
    }

    logging.info("1. Fetching clean percentage data from WHO API...")

    try:
        # Timeout prevents the script from hanging indefinitely on a stalled server.
        api_response = requests.get(api_url, params=params, timeout=120)
        api_response.raise_for_status()
        # Load the clean API data into a pandas table.
        api_df = pd.read_csv(io.StringIO(api_response.text))
    except Exception as e:
        logging.error(f"Failed to fetch or parse API data: {e}")
        return

    # 2. Get ONLY the iso3 code from the master database.
    logging.info("2. Fetching country iso3 codes from WHO master database...")
    master_url = "https://extranet.who.int/tme/generateCSV.asp?ds=notifications"

    try:
        master_response = requests.get(master_url, timeout=120)
        master_response.raise_for_status()
        # Use low_memory=False to handle mixed types in large WHO datasets.
        master_df = pd.read_csv(io.StringIO(master_response.text), usecols=['country', 'iso3'], low_memory=False)
        master_df = master_df.drop_duplicates()
    except Exception as e:
        logging.error(f"Failed to fetch or parse master data: {e}")
        return

    # 3. Merge the two datasets together based on the country name.
    logging.info("3. Merging data and formatting...")

    # Strip whitespace from the join keys so near-identical names still match.
    api_df['COUNTRY_MATCH'] = api_df['COUNTRY'].str.strip()
    master_df['country_match'] = master_df['country'].str.strip()

    # Left merge keeps every API row even when no iso3 match is found.
    merged_df = pd.merge(
        api_df,
        master_df,
        left_on='COUNTRY_MATCH',
        right_on='country_match',
        how='left'
    )

    # Clean up temporary matching columns and the duplicate country-name column.
    merged_df = merged_df.drop(columns=['COUNTRY_MATCH', 'country_match', 'country'])

    # Ensure all expected columns exist before reordering (safety check).
    final_columns = ['IND_ID', 'INDICATOR_NAME', 'YEAR', 'COUNTRY', 'iso3', 'DISAGGR_1', 'VALUE']
    existing_columns = [col for col in final_columns if col in merged_df.columns]
    merged_df = merged_df[existing_columns]

    # 4. Save to CSV in the import's input_files folder (created if missing).
    output_dir = "statvar_imports/tuberculosis_preventive_treatment/input_files"
    filename = os.path.join(output_dir, "Tuberculosis_preventive_treatment.csv")

    try:
        os.makedirs(output_dir, exist_ok=True)
        merged_df.to_csv(filename, index=False)
        # Bug fix: the original logged the literal text '(unknown)' instead of
        # interpolating the saved file path.
        logging.info(f"Success! Data saved locally as '{filename}'")
    except Exception as e:
        logging.error(f"Failed to save CSV: {e}")

# Script entry point: run the download only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    download_tb_percentage_data()
Loading