USE memory.default;

-- Employees whose expenses add up to more than 1000 units, shown together
-- with the name of their manager (NULL when the manager row is missing).
CREATE OR REPLACE VIEW largest_expensors AS (
    SELECT
        e.employee_id,
        concat(e.first_name, ' ', e.last_name) AS employee_name,
        e.manager_id,
        concat(m.first_name, ' ', m.last_name) AS manager_name,
        sum(x.unit_price * x.quantity)         AS total_expensed_amount
    FROM EMPLOYEE AS e
    -- Only employees with at least one expense row can qualify.
    INNER JOIN EXPENSE AS x
        ON x.employee_id = e.employee_id
    -- Self-join to resolve the manager's name; LEFT keeps employees without one.
    LEFT JOIN EMPLOYEE AS m
        ON m.employee_id = e.manager_id
    GROUP BY
        e.employee_id,
        concat(e.first_name, ' ', e.last_name),
        e.manager_id,
        concat(m.first_name, ' ', m.last_name)
    HAVING sum(x.unit_price * x.quantity) > 1000
);

SELECT * FROM largest_expensors ORDER BY total_expensed_amount DESC;
def _sql_text(value):
    """Render a CSV cell as a single-quoted SQL string literal (NULL when missing)."""
    if pd.isna(value):
        return "NULL"
    # Double embedded single quotes so names such as O'Neil stay valid SQL.
    return "'" + str(value).replace("'", "''") + "'"


def _sql_int(value):
    """Render a CSV cell as a SQL integer literal (NULL when missing)."""
    if pd.isna(value):
        return "NULL"
    # A column that contains blanks is read as float (e.g. 4.0); emit a plain integer.
    return str(int(value))


def generate_sql_from_csv(csv_path=None, output_path=None):
    """
    Read employee data from a CSV file and append one SQL INSERT statement
    for the brz_employees table to a .sql file.

    Args:
        csv_path: CSV file to read. Defaults to hr/employee_index.csv. It must
            provide the columns employee_id, first_name, last_name, job_title
            and manager_id.
        output_path: SQL file to append to. Defaults to
            data_loading/output_queries/brz_employees.sql.

    Returns:
        True when the statement was written (or the CSV held no rows),
        False when the CSV is missing or any error occurred while processing.
    """
    if csv_path is None:
        csv_path = os.path.join('hr', 'employee_index.csv')
    if output_path is None:
        # Built with os.path.join: the previous backslash literal contained
        # "\b", which Python reads as a backspace character.
        output_path = os.path.join('data_loading', 'output_queries', 'brz_employees.sql')

    # Check if the file exists
    if not os.path.exists(csv_path):
        print(f"Error: File not found at {csv_path}")
        print(f"Current working directory: {os.getcwd()}")
        return False

    try:
        # Read the CSV file into a pandas DataFrame
        df = pd.read_csv(csv_path)

        if df.empty:
            # An INSERT with an empty VALUES list is not valid SQL.
            print(f"No employee records found in {csv_path}; nothing written")
            return True

        # One "(...)" tuple per employee row
        value_rows = [
            "({}, {}, {}, {}, {})".format(
                _sql_int(row['employee_id']),
                _sql_text(row['first_name']),
                _sql_text(row['last_name']),
                _sql_text(row['job_title']),
                _sql_int(row['manager_id']),
            )
            for _, row in df.iterrows()
        ]

        all_values = ",\n    ".join(value_rows)
        sql_insert = (
            "-- Insert data from CSV\n"
            "INSERT INTO brz_employees (employee_id, first_name, last_name, job_title, manager_id) VALUES\n"
            f"    {all_values};\n"
        )

        # Append mode: running the script twice adds the statement twice.
        with open(output_path, 'a') as f:
            f.write(sql_insert)

        print(f"Successfully generated SQL INSERT statements for {len(df)} employee records")
        print(f"Appended to {output_path}")
        return True

    except Exception as e:
        # Script-level boundary: report the problem and signal failure to the caller.
        print(f"Error processing the data: {e}")
        return False
def _sql_quote(text):
    """Return *text* as a single-quoted SQL string literal with quotes doubled."""
    return "'" + text.replace("'", "''") + "'"


def parse_invoice_file(file_path):
    """Parse an invoice text file and extract relevant information.

    Args:
        file_path: Path of the invoice text file.

    Returns:
        A dict with 'company_name' (str, "" when absent), 'invoice_amount'
        (float, 0.0 when absent) and 'due_date_in_months' (int, 0 when absent).
    """
    with open(file_path, 'r') as file:
        content = file.read()

    # Extract company name. It is returned verbatim (apostrophes included);
    # SQL escaping is done where the statement is built.
    company_name_match = re.search(r'Company Name: (.+)', content)
    company_name = company_name_match.group(1).strip() if company_name_match else ""

    # Extract invoice amount, keeping an optional decimal part.
    amount_match = re.search(r'Invoice Amount: (\d+(?:\.\d+)?)', content)
    invoice_amount = float(amount_match.group(1)) if amount_match else 0.0

    # Extract due date in months; accept the singular "1 month" as well.
    due_date_match = re.search(r'Due Date: (\d+) months?', content)
    due_date_in_months = int(due_date_match.group(1)) if due_date_match else 0

    return {
        'company_name': company_name,
        'invoice_amount': invoice_amount,
        'due_date_in_months': due_date_in_months
    }


def generate_invoices_sql(invoices_dir=None):
    """Generate SQL for brz_invoices table.

    Args:
        invoices_dir: Directory holding the invoice *.txt files. Defaults to
            finance/invoices_due.

    Returns:
        The script text: table (re)creation followed by one INSERT per file.
    """
    if invoices_dir is None:
        invoices_dir = os.path.join('finance', 'invoices_due')

    # Create table
    sql = """-- Create brz_invoices table
USE memory.default;
DROP TABLE IF EXISTS brz_invoices;
CREATE TABLE brz_invoices (
    company_name VARCHAR(100),
    invoice_amount DECIMAL(8, 2),
    due_date_in_months INT
);

-- Insert data from invoices_due/*.txt files
"""

    # Get all invoice files; sorted so the generated script is reproducible.
    pattern = os.path.join(glob.escape(invoices_dir), '*.txt')
    invoice_files = sorted(glob.glob(pattern))

    # Process each file and generate INSERT statements
    inserts = []
    for file_path in invoice_files:
        invoice_data = parse_invoice_file(file_path)
        inserts.append(
            "INSERT INTO brz_invoices (company_name, invoice_amount, due_date_in_months) VALUES "
            f"({_sql_quote(invoice_data['company_name'])}, {invoice_data['invoice_amount']}, "
            f"{invoice_data['due_date_in_months']});"
        )

    return sql + "\n".join(inserts)


def main():
    """Generate the brz_invoices SQL script and write it to the output folder."""
    invoices_sql = generate_invoices_sql()
    output_path = os.path.join('data_loading', 'output_queries', 'brz_invoices.sql')
    with open(output_path, 'w') as file:
        file.write(invoices_sql)


if __name__ == "__main__":
    main()
INTO brz_employees (employee_id, first_name, last_name, job_title, manager_id) VALUES + (1, 'Ian', 'James', 'CEO', 4), + (2, 'Umberto', 'Torrielli', 'CSO', 1), + (3, 'Alex', 'Jacobson', 'MD EMEA', 2), + (4, 'Darren', 'Poynton', 'CFO', 2), + (5, 'Tim', 'Beard', 'MD APAC', 2), + (6, 'Gemma', 'Dodd', 'COS', 1), + (7, 'Lisa', 'Platten', 'CHR', 6), + (8, 'Stefano', 'Camisaca', 'GM Activation', 2), + (9, 'Andrea', 'Ghibaudi', 'MD NAM', 2); diff --git a/data_loading/output_queries/brz_expenses.sql b/data_loading/output_queries/brz_expenses.sql new file mode 100644 index 0000000..221325a --- /dev/null +++ b/data_loading/output_queries/brz_expenses.sql @@ -0,0 +1,17 @@ +-- Create brz_expenses table +USE memory.default; +DROP TABLE IF EXISTS brz_expenses; +CREATE TABLE brz_expenses ( + employee_full_name VARCHAR, + unit_price DECIMAL(8, 2), + quantity TINYINT +); + +-- Insert data from receipts_from_last_night/*.txt files +INSERT INTO brz_expenses (employee_full_name, unit_price, quantity) VALUES ('Alex Jacobson', 6.5, 14); +INSERT INTO brz_expenses (employee_full_name, unit_price, quantity) VALUES ('Alex Jacobson', 11.0, 20); +INSERT INTO brz_expenses (employee_full_name, unit_price, quantity) VALUES ('Alex Jacobson', 22.0, 18); +INSERT INTO brz_expenses (employee_full_name, unit_price, quantity) VALUES ('Alex Jacobson', 13.0, 75); +INSERT INTO brz_expenses (employee_full_name, unit_price, quantity) VALUES ('Andrea Ghibaudi', 0.0, 1); +INSERT INTO brz_expenses (employee_full_name, unit_price, quantity) VALUES ('Darren Poynton', 40.0, 9); +INSERT INTO brz_expenses (employee_full_name, unit_price, quantity) VALUES ('Umberto Torrielli', 17.5, 4); \ No newline at end of file diff --git a/data_loading/output_queries/brz_invoices.sql b/data_loading/output_queries/brz_invoices.sql new file mode 100644 index 0000000..93576ab --- /dev/null +++ b/data_loading/output_queries/brz_invoices.sql @@ -0,0 +1,16 @@ +-- Create brz_invoices table + USE memory.default; +DROP TABLE IF EXISTS 
USE memory.default;

-- View listing every manager that appears in the reporting chain below one
-- of their own direct reports, i.e. managers that are part of a loop.
CREATE OR REPLACE VIEW manager_cycles AS ( SELECT * FROM (
    -- Recursive MANAGERS_CTE: for each (manager, direct report) pair, walk down
    -- the reporting chain and accumulate the visited employee IDs as '1; 2; 3'.
    WITH RECURSIVE MANAGERS_CTE (
        manager_id
        , employee_id
        , employee_id_list
        , level
    ) AS (
        -- Anchor: every employee together with their direct manager
        SELECT
            manager_id
            , employee_id
            , cast(employee_id as VARCHAR) AS employee_id_list
            , 0 as level
        FROM EMPLOYEE

        UNION ALL

        -- Recursive: append the reports of the employee reached so far
        SELECT
            man.manager_id
            , emp.employee_id
            , concat(man.employee_id_list, '; ', cast(emp.employee_id as VARCHAR)) AS employee_id_list
            , man.level + 1
        FROM EMPLOYEE AS emp
        -- Continue only through employees that are themselves managers
        INNER JOIN MANAGERS_CTE AS man
            ON man.employee_id = emp.manager_id
        -- Avoid infinite recursion: stop when the employee is already in the list.
        -- Both sides are wrapped in the '; ' ... ';' delimiters so that whole IDs
        -- are compared (ID 1 must not match inside '10; 11').
        WHERE position(
                concat('; ', cast(emp.employee_id as VARCHAR), ';')
                in concat('; ', man.employee_id_list, ';')
            ) = 0
    )

    -- List of managers that form a managing loop.
    SELECT
        manager_id
        , max(employee_id_list) AS employee_id_list
        , max(level) AS subordinates_level
    FROM MANAGERS_CTE
    -- Keep managers found in their own recursive list of employees (a loop),
    -- again matching whole delimited IDs rather than substrings.
    WHERE position(
            concat('; ', cast(manager_id as VARCHAR), ';')
            in concat('; ', employee_id_list, ';')
        ) <> 0
    GROUP BY manager_id
));

SELECT * FROM manager_cycles;
+ WITH invoices_cte AS ( + SELECT + inv.supplier_id + , sup.name as supplier_name + , inv.invoice_amount + , inv.due_date + , date_diff('month', now(), inv.due_date) + 1 AS remaining_months + , sum(inv.invoice_amount) over (partition by inv.supplier_id) AS total_payment + FROM INVOICE AS inv + LEFT JOIN SUPPLIER AS sup + on sup.supplier_id = inv.supplier_id + ), + -- Calculate the payment structure for each invoice. + payments_cte AS ( + SELECT + inv.supplier_id + , inv.supplier_name + , inv.invoice_amount / inv.remaining_months AS monthly_payment_amount + , last_day_of_month(date_add('month', remaining_months_list, now())) AS payment_date + , inv.total_payment + FROM invoices_cte AS inv + CROSS JOIN unnest(sequence(0, inv.remaining_months - 1)) AS t(remaining_months_list) + ), + -- Aggregate the payments for each invoice in case more than one invoice is for the same supplier. + aggregate_payments_cte AS ( + SELECT + supplier_id + , supplier_name + , sum(monthly_payment_amount) AS payment_amount + , payment_date + , total_payment + FROM payments_cte + group by supplier_id, supplier_name, payment_date, total_payment + ) + + -- Show results while calculating the balance after each payment. 
+ SELECT DISTINCT + pay.supplier_id + , pay.supplier_name + , pay.payment_amount + , (pay.total_payment - sum(pay.payment_amount) over (partition by supplier_id order by payment_date)) AS balance_outstanding + , pay.payment_date + FROM aggregate_payments_cte AS pay +)); + +SELECT * FROM payment_plans ORDER BY supplier_id, supplier_name, payment_date; diff --git a/testing/data_testing/data_quality_testing.sql b/testing/data_testing/data_quality_testing.sql new file mode 100644 index 0000000..e3b5ec5 --- /dev/null +++ b/testing/data_testing/data_quality_testing.sql @@ -0,0 +1,18 @@ +-- =============================== +-- DATA QUALITY TESTS +-- =============================== + +USE memory.default; + +-- Referential integrity testing: +-- Expenses with non-existing employee_ids +SELECT COUNT(*) +FROM EXPENSE a +LEFT JOIN EMPLOYEE b ON a.employee_id = b.employee_id +WHERE b.employee_id IS NULL; + +-- Invoices with non-existing supplier_ids +SELECT COUNT(*) +FROM INVOICE a +LEFT JOIN SUPPLIER b ON a.supplier_id = b.supplier_id +WHERE b.supplier_id IS NULL; \ No newline at end of file diff --git a/testing/data_testing/data_validation_testing.sql b/testing/data_testing/data_validation_testing.sql new file mode 100644 index 0000000..e1e62ff --- /dev/null +++ b/testing/data_testing/data_validation_testing.sql @@ -0,0 +1,45 @@ +-- =============================== +-- VALIDATION TESTS +-- =============================== + +USE memory.default; + +-- Null tests for key columns: +SELECT + COUNT(*) AS total_rows, + SUM(CASE WHEN employee_id IS NULL THEN 1 ELSE 0 END) AS null_count, + CAST(SUM(CASE WHEN employee_id IS NULL THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*) AS null_percentage +FROM EXPENSE; + +SELECT + COUNT(*) AS total_rows, + SUM(CASE WHEN employee_id IS NULL THEN 1 ELSE 0 END) AS null_count, + CAST(SUM(CASE WHEN employee_id IS NULL THEN 1 ELSE 0 END) AS DOUBLE) / COUNT(*) AS null_percentage +FROM EMPLOYEE; + +SELECT + COUNT(*) AS total_rows, + SUM(CASE WHEN supplier_id IS 
-- ==================================
-- QUERY TESTING: largest_expensors
-- ==================================

USE memory.default;

-- Unexpected values: the view keeps only totals strictly greater than 1000,
-- so a total of exactly 1000 is unexpected too. Expected result: 0.
SELECT COUNT(*) AS unexpected_values
FROM largest_expensors
WHERE total_expensed_amount <= 1000;

-- Result count validation: the view must return as many rows as a direct
-- aggregation over EMPLOYEE and EXPENSE.
WITH direct_expensors AS (
    -- One row per qualifying employee. Counting these rows in an outer query
    -- keeps the comparison a single value; a COUNT placed next to the GROUP BY
    -- would yield one row per employee and break the scalar sub-query.
    SELECT emp.employee_id
    FROM EMPLOYEE AS emp
    INNER JOIN EXPENSE AS exp ON emp.employee_id = exp.employee_id
    GROUP BY emp.employee_id
    HAVING SUM(exp.unit_price * exp.quantity) > 1000
),
counts AS (
    SELECT
        (SELECT COUNT(*) FROM largest_expensors) AS view_count,
        (SELECT COUNT(*) FROM direct_expensors) AS direct_count
)
SELECT
    view_count,
    direct_count,
    CASE WHEN view_count = direct_count THEN 'PASS' ELSE 'FAIL' END AS test_result
FROM counts;
-- ==================================
-- QUERY TESTING: payment_plans
-- ==================================

USE memory.default;

-- Validate that the payments scheduled for each supplier add up to the
-- total amount invoiced by that supplier.
WITH supplier_invoice_totals AS (
    -- Total invoice amount for each supplier
    SELECT
        supplier_id,
        SUM(invoice_amount) AS expected_total
    FROM INVOICE
    GROUP BY supplier_id
),
supplier_payment_totals AS (
    -- Total payments for each supplier according to the payment_plans view
    SELECT
        supplier_id,
        SUM(payment_amount) AS actual_total
    FROM payment_plans
    GROUP BY supplier_id
)
SELECT
    'Payment Total Test' AS test_name,
    COALESCE(e.supplier_id, a.supplier_id) AS supplier_id,
    e.expected_total,
    a.actual_total,
    ABS(e.expected_total - a.actual_total) AS difference,
    -- Allow for tiny rounding differences. A supplier present on only one side
    -- has a NULL difference and therefore falls through to 'FAIL'.
    CASE WHEN ABS(e.expected_total - a.actual_total) <= 0.01 THEN 'PASS' ELSE 'FAIL' END AS result
FROM supplier_invoice_totals e
-- FULL OUTER JOIN so suppliers missing from either side are reported, not dropped.
FULL OUTER JOIN supplier_payment_totals a ON e.supplier_id = a.supplier_id
ORDER BY 2;