diff --git a/calculate_largest_expensors.sql b/calculate_largest_expensors.sql
index e69de29..4c6601c 100644
--- a/calculate_largest_expensors.sql
+++ b/calculate_largest_expensors.sql
@@ -0,0 +1,38 @@
+USE memory.default;
+
+/*
+Report every employee whose total expensed amount exceeds 1000, largest total first.
+
+First we need to find all expenses, so we select from the "EXPENSE" table. After that, we join it
+to the "EMPLOYEE" table to get the actual employees, and to "EMPLOYEE" once more (alias "manager")
+to get their managers. After that, it's just a matter of a simple sum and filter.
+
+We use HAVING instead of WHERE because of the SQL order of operations: HAVING is evaluated after
+the aggregation, while WHERE is evaluated before it.
+
+manager_id is read from manager.employee_id so that it identifies the same person as manager_name;
+manager.manager_id would be the id of the manager's own manager.
+*/
+SELECT
+    employee.employee_id,
+    CONCAT(employee.first_name, ' ', employee.last_name) AS employee_name,
+    manager.employee_id AS manager_id,
+    CONCAT(manager.first_name, ' ', manager.last_name) AS manager_name,
+    SUM(expense.unit_price * expense.quantity) AS total_expensed_amount
+FROM
+    EXPENSE expense
+LEFT JOIN
+    EMPLOYEE employee
+    ON expense.employee_id = employee.employee_id
+LEFT JOIN
+    EMPLOYEE manager
+    ON manager.employee_id = employee.manager_id
+GROUP BY
+    employee.employee_id,
+    CONCAT(employee.first_name, ' ', employee.last_name),
+    manager.employee_id,
+    CONCAT(manager.first_name, ' ', manager.last_name)
+HAVING
+    SUM(expense.unit_price * expense.quantity) > 1000
+ORDER BY
+    SUM(expense.unit_price * expense.quantity) DESC;
\ No newline at end of file
diff --git a/create_employees.sql b/create_employees.sql
index e69de29..d90d1b6 100644
--- a/create_employees.sql
+++ b/create_employees.sql
@@ -0,0 +1,28 @@
+USE memory.default;
+
+/*
+In the real world, creating a table would not be so trivial (perhaps only for really small or manually created use-cases).
+Another option for loading data (from .csv, .parquet, etc.) would be to connect to an external location such as S3 or GCS or even
+the local file system and load from there.
+
+As I have a time constraint on this task, I've decided to load the data manually.
+*/
+CREATE TABLE IF NOT EXISTS EMPLOYEE (
+    employee_id TINYINT,
+    first_name VARCHAR,
+    last_name VARCHAR,
+    job_title VARCHAR,
+    manager_id TINYINT -- employee_id of this employee's manager
+);
+
+-- Data as shown in hr/employee_index.csv. Columns are listed explicitly so the load does not depend on column order.
+INSERT INTO EMPLOYEE (employee_id, first_name, last_name, job_title, manager_id) VALUES
+    (1, 'Ian', 'James', 'CEO', 4),
+    (2, 'Umberto', 'Torrielli', 'CSO', 1),
+    (3, 'Alex', 'Jacobson', 'MD EMEA', 2),
+    (4, 'Darren', 'Poynton', 'CFO', 2),
+    (5, 'Tim', 'Beard', 'MD APAC', 2),
+    (6, 'Gemma', 'Dodd', 'COS', 1),
+    (7, 'Lisa', 'Platten', 'CHR', 6),
+    (8, 'Stefano', 'Camisaca', 'GM Activation', 2),
+    (9, 'Andrea', 'Ghibaudi', 'MD NAM', 2);
\ No newline at end of file
diff --git a/create_expenses.sql b/create_expenses.sql
index e69de29..e1e4e16 100644
--- a/create_expenses.sql
+++ b/create_expenses.sql
@@ -0,0 +1,20 @@
+USE memory.default;
+
+/*
+The expense data is loaded manually, for the same reason as in create_employees.sql.
+*/
+CREATE TABLE IF NOT EXISTS EXPENSE (
+    employee_id TINYINT,
+    unit_price DECIMAL(8, 2),
+    quantity TINYINT
+);
+
+-- Data as shown in finance/receipts_from_last_night/*.txt. Again, due to the small data sample, I've simply looked up the employees' ids from their names manually.
+INSERT INTO EXPENSE (employee_id, unit_price, quantity) VALUES
+    (3, 6.50, 14),
+    (3, 11.00, 20),
+    (3, 22.00, 18),
+    (3, 13.00, 75),
+    (9, 300.00, 1),
+    (4, 40.00, 9),
+    (2, 17.50, 4);
\ No newline at end of file
diff --git a/create_invoices.sql b/create_invoices.sql
index e69de29..16f8b95 100644
--- a/create_invoices.sql
+++ b/create_invoices.sql
@@ -0,0 +1,35 @@
+USE memory.default;
+
+/*
+The supplier data is loaded manually, for the same reason as in create_employees.sql.
+*/
+CREATE TABLE IF NOT EXISTS SUPPLIER (
+    supplier_id TINYINT,
+    name VARCHAR
+);
+
+-- Data as shown in finance/invoices/*.txt
+INSERT INTO SUPPLIER (supplier_id, name) VALUES
+    (1, 'Catering Plus'),
+    (2, 'Dave''s Discos'),
+    (3, 'Entertainment tonight'),
+    (4, 'Ice Ice Baby'),
+    (5, 'Party Animals');
+
+/*
+Same as for create_employees.sql...
+*/
+CREATE TABLE IF NOT EXISTS INVOICE (
+    supplier_id TINYINT,
+    invoice_amount DECIMAL(8, 2),
+    due_date DATE
+);
+
+-- Data as shown in finance/invoices/*.txt. Each due date is the last day of the month N months after the load date.
+INSERT INTO INVOICE (supplier_id, invoice_amount, due_date) VALUES
+    (5, 6000.00, LAST_DAY_OF_MONTH(DATE_ADD('month', 3, CURRENT_DATE))),
+    (1, 2000.00, LAST_DAY_OF_MONTH(DATE_ADD('month', 2, CURRENT_DATE))),
+    (1, 1500.00, LAST_DAY_OF_MONTH(DATE_ADD('month', 3, CURRENT_DATE))),
+    (2, 500.00, LAST_DAY_OF_MONTH(DATE_ADD('month', 1, CURRENT_DATE))),
+    (3, 6000.00, LAST_DAY_OF_MONTH(DATE_ADD('month', 3, CURRENT_DATE))),
+    (4, 4000.00, LAST_DAY_OF_MONTH(DATE_ADD('month', 6, CURRENT_DATE)));
\ No newline at end of file
diff --git a/find_manager_cycles.sql b/find_manager_cycles.sql
index e69de29..b74fc99 100644
--- a/find_manager_cycles.sql
+++ b/find_manager_cycles.sql
@@ -0,0 +1,54 @@
+USE memory.default;
+
+/*
+Find reporting cycles in EMPLOYEE: employees who, by following manager_id links, end up
+reporting to themselves.
+
+A recursive CTE walks the manager chain upwards from every employee and records the visited
+employee ids in "path". A walk stops as soon as the next manager is already on its path, which
+guarantees termination even when the data contains cycles. A cycle is reported for every
+starting employee whose walk leads back to that same employee, so a cycle of N employees
+yields N rows (one per member).
+
+NOTE(review): Trino limits recursive CTE depth through the max_recursion_depth session
+property - confirm the limit is sufficient before running this on deeper hierarchies.
+*/
+WITH RECURSIVE manager_loop (
+    start_employee,
+    current_employee,
+    next_manager,
+    path
+) AS (
+    -- Anchor: one single-element path per employee.
+    SELECT
+        e.employee_id AS start_employee,
+        e.employee_id AS current_employee,
+        e.manager_id AS next_manager,
+        ARRAY[e.employee_id] AS path
+    FROM
+        EMPLOYEE e
+
+    UNION ALL
+
+    -- Recursive step: move up to the manager unless that manager was already visited.
+    SELECT
+        ml.start_employee,
+        e.employee_id AS current_employee,
+        e.manager_id AS next_manager,
+        ml.path || e.employee_id
+    FROM
+        manager_loop ml
+    INNER JOIN
+        EMPLOYEE e
+        ON ml.next_manager = e.employee_id
+    WHERE
+        NOT CONTAINS(ml.path, e.employee_id)
+)
+-- The walk closed a loop when the next manager is the employee it started from.
+SELECT
+    ml.start_employee AS employee_id,
+    ARRAY_JOIN(ml.path || ml.next_manager, ', ') AS full_cycle_path
+FROM
+    manager_loop ml
+WHERE
+    ml.next_manager = ml.start_employee;
\ No newline at end of file
diff --git a/generate_supplier_payment_plans.sql b/generate_supplier_payment_plans.sql
index e69de29..8fd3b3f
100644
--- a/generate_supplier_payment_plans.sql
+++ b/generate_supplier_payment_plans.sql
@@ -0,0 +1,82 @@
+USE memory.default;
+
+/*
+Build a month-end payment plan for every supplier.
+
+All invoices of a supplier are combined into one total, which is paid in monthly instalments on
+the last day of each month: from the current month up to the month of the supplier's latest due
+date. Every instalment is the total divided by the number of payments, rounded down to a whole
+amount; the last instalment additionally carries the remainder.
+*/
+
+-- Aggregate all invoices for each supplier to get a total amount and the number of payments.
+WITH supplier_invoice AS (
+    SELECT
+        i.supplier_id,
+        s.name AS supplier_name,
+        CAST(SUM(i.invoice_amount) AS DECIMAL(10, 2)) AS sum_invoice_amt,
+        -- One payment per month from the current month through the latest due date. GREATEST keeps
+        -- the count at 1 when that date is already in the past; a zero or negative count would
+        -- otherwise cause a division by zero or a descending SEQUENCE further down.
+        GREATEST(DATE_DIFF('month', CURRENT_DATE, MAX(i.due_date)) + 1, 1) AS payment_num
+    FROM
+        INVOICE i
+    LEFT JOIN
+        SUPPLIER s
+        ON i.supplier_id = s.supplier_id
+    GROUP BY
+        -- Grouped on the invoice's supplier_id so that invoices without a SUPPLIER row are not
+        -- merged into a single NULL group.
+        i.supplier_id,
+        s.name
+),
+
+-- Calculate the amount for a standard payment and the remainder added to the final payment.
+payment_details AS (
+    SELECT
+        supplier_id,
+        supplier_name,
+        sum_invoice_amt,
+        payment_num,
+        CAST(FLOOR(sum_invoice_amt / payment_num) AS DECIMAL(10, 2)) AS payment_amt_monthly,
+        CAST(MOD(sum_invoice_amt, payment_num) AS DECIMAL(10, 2)) AS last_payment_adjustment
+    FROM
+        supplier_invoice
+),
+
+-- Generate the payment schedule: one row per supplier and payment (seq 0 = current month).
+payment_schedule AS (
+    SELECT
+        d.supplier_id,
+        d.supplier_name,
+        d.sum_invoice_amt,
+        -- Only the last payment (seq = payment_num - 1) includes the remainder.
+        CASE
+            WHEN t.seq = d.payment_num - 1 THEN d.payment_amt_monthly + d.last_payment_adjustment
+            ELSE d.payment_amt_monthly
+        END AS payment_amount,
+        LAST_DAY_OF_MONTH(DATE_ADD('month', t.seq, CURRENT_DATE)) AS payment_date,
+        t.seq
+    FROM
+        payment_details d
+    CROSS JOIN
+        UNNEST(SEQUENCE(0, d.payment_num - 1)) AS t(seq)
+)
+-- Calculate the running balance and display the final report.
+SELECT
+    supplier_id,
+    supplier_name,
+    payment_amount,
+    -- Balance = total minus the running sum of payments up to and including this row. The frame is
+    -- spelled out as ROWS so the running sum never depends on the default RANGE frame.
+    sum_invoice_amt - SUM(payment_amount) OVER (
+        PARTITION BY supplier_id
+        ORDER BY payment_date
+        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
+    ) AS balance_outstanding,
+    payment_date
+FROM
+    payment_schedule
+ORDER BY
+    supplier_id,
+    payment_date;
\ No newline at end of file
diff --git a/tests/00_referential_integrity.sql b/tests/00_referential_integrity.sql
new file mode 100644
index 0000000..3e1303d
--- /dev/null
+++ b/tests/00_referential_integrity.sql
@@ -0,0 +1,49 @@
+USE memory.default;
+
+-- Test 1: Check for expenses logged by non-existent employees.
+-- This query identifies any employee_id in the EXPENSE table
+-- that does not have a corresponding entry in the EMPLOYEE table.
+-- An ideal result is an empty set, indicating no orphaned expense records.
+SELECT
+    e.employee_id
+FROM
+    EXPENSE e
+LEFT JOIN
+    EMPLOYEE emp
+    ON e.employee_id = emp.employee_id
+WHERE
+    1 = 1
+    AND emp.employee_id IS NULL;
+
+-- Test 2: Check for employees with non-existent managers.
+-- This query checks for any manager_id in the EMPLOYEE table that does not
+-- correspond to a valid employee_id in the same table. This is a self-referencing
+-- foreign key check. The CEO's manager_id might be NULL, so we exclude that.
+-- An ideal result is an empty set.
+SELECT
+    emp.employee_id,
+    emp.manager_id
+FROM
+    EMPLOYEE emp
+LEFT JOIN
+    EMPLOYEE mgr
+    ON emp.manager_id = mgr.employee_id
+WHERE
+    1 = 1
+    AND mgr.employee_id IS NULL
+    AND emp.manager_id IS NOT NULL;
+
+-- Test 3: Check for invoices from non-existent suppliers.
+-- This query looks for any supplier_id in the INVOICE table that
+-- does not exist in the SUPPLIER table.
+-- A clean result (empty set) means all invoices are linked to valid suppliers.
+SELECT
+    inv.supplier_id
+FROM
+    INVOICE inv
+WHERE
+    NOT EXISTS (
+        SELECT 1
+        FROM SUPPLIER sup
+        WHERE sup.supplier_id = inv.supplier_id
+    );
\ No newline at end of file
diff --git a/tests/01_data_validation.sql b/tests/01_data_validation.sql
new file mode 100644
index 0000000..4561a75
--- /dev/null
+++ b/tests/01_data_validation.sql
@@ -0,0 +1,66 @@
+USE memory.default;
+
+-- Test 1: Row Count Checks
+-- Reports the current number of rows in each table, one result row per table.
+-- Useful for tracking table growth and verifying data loads.
+SELECT 'EMPLOYEE' AS table_name, COUNT(*) AS row_count
+FROM EMPLOYEE
+UNION ALL
+SELECT 'EXPENSE' AS table_name, COUNT(*) AS row_count
+FROM EXPENSE
+UNION ALL
+SELECT 'SUPPLIER' AS table_name, COUNT(*) AS row_count
+FROM SUPPLIER
+UNION ALL
+SELECT 'INVOICE' AS table_name, COUNT(*) AS row_count
+FROM INVOICE;
+
+-- Test 2: Null Value Checks for Primary Keys
+-- Counts rows whose primary key column is NULL, one result row per table.
+-- These columns should never be null, so the ideal count is 0.
+SELECT 'EMPLOYEE' AS table_name, COUNT(*) AS null_employee_ids
+FROM EMPLOYEE
+WHERE employee_id IS NULL
+UNION ALL
+SELECT 'SUPPLIER' AS table_name, COUNT(*) AS null_supplier_ids
+FROM SUPPLIER
+WHERE supplier_id IS NULL;
+
+-- Test 3: Uniqueness Checks for Primary Keys
+-- Lists every primary key value that occurs more than once, together with its occurrence count.
+-- An ideal result is an empty set.
+SELECT employee_id, COUNT(*)
+FROM EMPLOYEE
+GROUP BY employee_id
+HAVING COUNT(*) > 1;
+
+SELECT supplier_id, COUNT(*)
+FROM SUPPLIER
+GROUP BY supplier_id
+HAVING COUNT(*) > 1;
+
+/*
+Test 4: Data Constraint Checks
+This query checks for any records that violate logical data constraints,
+such as zero or negative prices or quantities.
+An ideal result is an empty set.
+*/
+SELECT
+    employee_id,
+    unit_price,
+    quantity
+FROM
+    EXPENSE
+WHERE
+    1 = 1
+    AND (unit_price <= 0 OR quantity <= 0); -- parenthesized: AND binds tighter than OR
+
+-- Check for zero or negative invoice amounts. An ideal result is an empty set.
+SELECT
+    supplier_id,
+    invoice_amount
+FROM
+    INVOICE
+WHERE
+    1 = 1
+    AND invoice_amount <= 0;
\ No newline at end of file