-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCARTmodel.py
More file actions
64 lines (55 loc) · 1.58 KB
/
CARTmodel.py
File metadata and controls
64 lines (55 loc) · 1.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# -*- coding: utf-8 -*-
"""Untitled13.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1tWP7KMpk-I8Q4CwxDF7r_O46NkznT3Ye
"""
-- Example dataset: housing-like data
CREATE TABLE housing (
id SERIAL PRIMARY KEY,
rooms INT,
sqft INT,
age INT,
price FLOAT
);
-- Insert synthetic rows (~1M for demo; scale down if laptop is slow)
INSERT INTO housing (rooms, sqft, age, price)
SELECT
(random()*5)::INT + 1,
(random()*2000)::INT + 300,
(random()*50)::INT,
(random()*200000 + 50000)::INT
FROM generate_series(1, 1000000);
-- Function to compute variance reduction for a candidate split
CREATE OR REPLACE FUNCTION variance_reduction(
feature TEXT,
threshold FLOAT
) RETURNS FLOAT AS $$
DECLARE
total_var FLOAT;
left_var FLOAT;
right_var FLOAT;
n_total INT;
n_left INT;
n_right INT;
reduction FLOAT;
BEGIN
-- Total variance
EXECUTE format('SELECT VARIANCE(price), COUNT(*) FROM housing')
INTO total_var, n_total;
-- Left split variance
EXECUTE format('SELECT VARIANCE(price), COUNT(*) FROM housing WHERE %I <= $1', feature)
INTO left_var, n_left
USING threshold;
-- Right split variance
EXECUTE format('SELECT VARIANCE(price), COUNT(*) FROM housing WHERE %I > $1', feature)
INTO right_var, n_right
USING threshold;
-- Weighted child variance
reduction := total_var - (
(n_left::FLOAT/n_total)*COALESCE(left_var,0) +
(n_right::FLOAT/n_total)*COALESCE(right_var,0)
);
RETURN reduction;
END;
$$ LANGUAGE plpgsql;