From 3b1dc7cd0f5a696c5b807e4d17af6ba0ef34cf5b Mon Sep 17 00:00:00 2001 From: Igor Kvachenok Date: Tue, 28 Apr 2026 19:08:29 +0200 Subject: [PATCH 1/4] Add Feast feature store examples with MLflow integration --- feast/README.md | 81 ++++++++ feast/feast_example.ipynb | 400 ++++++++++++++++++++++++++++++++++++++ feast/feature_store.yaml | 22 +++ feast/features.py | 50 +++++ feast/featurestore.yaml | 45 +++++ 5 files changed, 598 insertions(+) create mode 100644 feast/README.md create mode 100644 feast/feast_example.ipynb create mode 100644 feast/feature_store.yaml create mode 100644 feast/features.py create mode 100644 feast/featurestore.yaml diff --git a/feast/README.md b/feast/README.md new file mode 100644 index 0000000..b43f3e3 --- /dev/null +++ b/feast/README.md @@ -0,0 +1,81 @@ +# Feast Feature Store Examples + +These examples demonstrate how to use [Feast](https://docs.feast.dev/) on prokube +for feature management in ML workflows. + +## Prerequisites + +- Feast must be enabled on your cluster (ask your admin) +- A `FeatureStore` CR must be created in your workspace (see below) +- The `feast-registry-config` secret must exist in your namespace + +## Quick Start + +### 1. Create a FeatureStore in your workspace + +Apply the example CR (edit the namespace): + +```bash +kubectl apply -f featurestore.yaml +``` + +Wait for it to become ready: + +```bash +kubectl get featurestore -n -w +``` + +### 2. Get your client configuration + +The operator creates a ConfigMap with connection info: + +```bash +kubectl get configmap feast--client -n \ + -o jsonpath='{.data.feature_store\.yaml}' +``` + +Save this as `feature_store.yaml` in your working directory. This config only +contains the **online store** endpoint (remote HTTP). For operations that need +registry access (`feast apply`, `get_historical_features`, `materialize`), you +need to build a full config — see the notebook example. + +### 3. 
Run the notebook + +Open `feast_example.ipynb` in your Kubeflow notebook and follow the steps. + +## Examples + +| File | Description | +|------|-------------| +| `featurestore.yaml` | FeatureStore CR to deploy in your namespace | +| `feature_store.yaml` | Client config template (fill in your namespace + registry) | +| `features.py` | Feature definitions — entities, sources, feature views | +| `feast_example.ipynb` | End-to-end notebook: define, apply, materialize, train, serve | + +## Architecture + +``` + ┌─────────────────────────────────┐ + │ Your Namespace │ + │ │ + feast apply ──────▶ MariaDB (registry) │ + │ - feature definitions │ + │ - entity schemas │ + │ │ + materialize ──────▶ SQLite on PVC (online store) │ + │ - latest feature values │ + │ │ + historical ──────▶ Parquet on PVC (offline store) │ + features │ - time-series feature data │ + │ │ + │ Feast Server (deployment) │ + │ - serves online features │ + └─────────────────────────────────┘ +``` + +- **Registry** (MariaDB): stores metadata — what features exist, their schemas, + data sources. Shared across all Feast processes in your namespace. +- **Online store** (SQLite/PVC): key-value store with the *latest* feature values + per entity. Updated by `feast materialize`. Used for real-time inference. +- **Offline store** (Dask/file/PVC): historical feature data in parquet files. + Used for training dataset generation with point-in-time correctness. diff --git a/feast/feast_example.ipynb b/feast/feast_example.ipynb new file mode 100644 index 0000000..7409687 --- /dev/null +++ b/feast/feast_example.ipynb @@ -0,0 +1,400 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Feast Feature Store — End-to-End Example\n", + "\n", + "This notebook walks through the full Feast workflow on prokube:\n", + "\n", + "1. Generate sample feature data\n", + "2. Configure Feast client\n", + "3. Register features in the registry (`feast apply`)\n", + "4. 
Retrieve historical features for training\n", + "5. Train a model and log to MLflow\n", + "6. Materialize features to the online store\n", + "7. Serve features online for inference\n", + "\n", + "## Prerequisites\n", + "\n", + "- A `FeatureStore` CR is deployed in your namespace (see `featurestore.yaml`)\n", + "- The `feast-registry-config` secret exists in your namespace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -q feast scikit-learn mlflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Generate sample data\n", + "\n", + "We create a parquet file simulating hourly driver statistics over the past 7 days.\n", + "In a real scenario, this would come from your data pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "import os\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "np.random.seed(42)\n", + "n = 1000\n", + "now = datetime.datetime.now()\n", + "timestamps = [now - datetime.timedelta(hours=i) for i in range(n)]\n", + "\n", + "driver_df = pd.DataFrame(\n", + " {\n", + " \"driver_id\": np.random.choice([1001, 1002, 1003, 1004, 1005], n),\n", + " \"event_timestamp\": timestamps,\n", + " \"conv_rate\": np.random.uniform(0.1, 1.0, n).astype(np.float32),\n", + " \"acc_rate\": np.random.uniform(0.5, 1.0, n).astype(np.float32),\n", + " \"avg_daily_trips\": np.random.randint(1, 50, n).astype(np.int64),\n", + " \"created\": timestamps,\n", + " }\n", + ")\n", + "\n", + "os.makedirs(\"data\", exist_ok=True)\n", + "driver_df.to_parquet(\"data/driver_stats.parquet\")\n", + "print(f\"Created {n} rows for {driver_df['driver_id'].nunique()} drivers\")\n", + "driver_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 
Configure the Feast client\n", + "\n", + "The operator creates a ConfigMap with basic client config, but it only has the\n", + "online store endpoint. For full functionality (apply, materialize, historical\n", + "features) we need registry access too.\n", + "\n", + "We build `feature_store.yaml` by reading the registry URI from the\n", + "`feast-registry-config` secret." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess\n", + "\n", + "\n", + "def get_registry_uri():\n", + " \"\"\"Read the registry URI from the feast-registry-config secret.\n", + "\n", + " In a Kubeflow notebook pod, you can read secrets via kubectl (if RBAC allows)\n", + " or mount them as volumes. Adjust this for your environment.\n", + " \"\"\"\n", + " result = subprocess.run(\n", + " [\n", + " \"kubectl\",\n", + " \"get\",\n", + " \"secret\",\n", + " \"feast-registry-config\",\n", + " \"-o\",\n", + " \"jsonpath={.data.sql}\",\n", + " ],\n", + " capture_output=True,\n", + " text=True,\n", + " )\n", + " import base64\n", + "\n", + " decoded = base64.b64decode(result.stdout).decode()\n", + " # Format is \"path: mysql+pymysql://...\"\n", + " return decoded.replace(\"path: \", \"\").strip()\n", + "\n", + "\n", + "def get_namespace():\n", + " \"\"\"Get the current namespace from the service account.\"\"\"\n", + " try:\n", + " with open(\"/var/run/secrets/kubernetes.io/serviceaccount/namespace\") as f:\n", + " return f.read().strip()\n", + " except FileNotFoundError:\n", + " return subprocess.check_output(\n", + " [\"kubectl\", \"config\", \"view\", \"--minify\", \"-o\", \"jsonpath={..namespace}\"]\n", + " ).decode().strip()\n", + "\n", + "\n", + "NAMESPACE = get_namespace()\n", + "REGISTRY_URI = get_registry_uri()\n", + "FEAST_PROJECT = \"my_features\" # must match your FeatureStore CR's spec.feastProject\n", + "ONLINE_STORE_HOST = f\"feast-my-store-online.{NAMESPACE}.svc.cluster.local\"\n", + "\n", + 
"feature_store_yaml = f\"\"\"project: {FEAST_PROJECT}\n", + "provider: local\n", + "registry:\n", + " registry_type: sql\n", + " path: \\\"{REGISTRY_URI}\\\"\n", + " cache_ttl_seconds: 60\n", + "online_store:\n", + " path: http://{ONLINE_STORE_HOST}:80\n", + " type: remote\n", + "offline_store:\n", + " type: dask\n", + "auth:\n", + " type: no_auth\n", + "entity_key_serialization_version: 3\n", + "\"\"\"\n", + "\n", + "with open(\"feature_store.yaml\", \"w\") as f:\n", + " f.write(feature_store_yaml)\n", + "\n", + "print(\"feature_store.yaml written\")\n", + "print(f\"Namespace: {NAMESPACE}\")\n", + "print(f\"Registry: {REGISTRY_URI[:50]}...\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Register features\n", + "\n", + "`feast apply` reads `features.py` and registers the entity, data source, and\n", + "feature view definitions in the MariaDB registry." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!feast apply" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Verify what's registered\n", + "!feast feature-views list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Retrieve historical features for training\n", + "\n", + "`get_historical_features` performs a **point-in-time join**: for each entity row,\n", + "it finds the most recent feature values as of that entity's timestamp. This\n", + "prevents data leakage — you only see features that were available at the time\n", + "the event occurred." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from feast import FeatureStore\n", + "\n", + "store = FeatureStore(repo_path=\".\")\n", + "\n", + "# Entity dataframe: \"give me features for these drivers at these timestamps\"\n", + "entity_df = pd.DataFrame(\n", + " {\n", + " \"driver_id\": [1001, 1002, 1003, 1004, 1005],\n", + " \"event_timestamp\": [now] * 5,\n", + " }\n", + ")\n", + "\n", + "training_df = store.get_historical_features(\n", + " entity_df=entity_df,\n", + " features=[\n", + " \"driver_hourly_stats:conv_rate\",\n", + " \"driver_hourly_stats:acc_rate\",\n", + " \"driver_hourly_stats:avg_daily_trips\",\n", + " ],\n", + ").to_df()\n", + "\n", + "print(\"Training data (point-in-time correct):\")\n", + "training_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Train a model and log to MLflow\n", + "\n", + "Use the Feast-provided training data to train a simple model, then log\n", + "everything to MLflow — including which features were used." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlflow\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import mean_squared_error, r2_score\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# MLflow tracking URI — adjust if your cluster uses a different location\n", + "mlflow.set_tracking_uri(\"http://mlflow-mlflow-tracking-server.mlflow.svc.cluster.local:80\")\n", + "mlflow.set_experiment(\"feast-driver-prediction\")\n", + "\n", + "FEATURE_COLS = [\"acc_rate\", \"avg_daily_trips\"]\n", + "TARGET = \"conv_rate\"\n", + "\n", + "X = training_df[FEATURE_COLS].fillna(0)\n", + "y = training_df[TARGET].fillna(0)\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "with mlflow.start_run(run_name=\"feast-driver-conv-rate\") as run:\n", + " model = LinearRegression()\n", + " model.fit(X_train, y_train)\n", + " y_pred = model.predict(X_test)\n", + "\n", + " mse = mean_squared_error(y_test, y_pred)\n", + " r2 = r2_score(y_test, y_pred)\n", + "\n", + " # Log feature provenance — which Feast features fed this model\n", + " mlflow.log_param(\"feast_project\", FEAST_PROJECT)\n", + " mlflow.log_param(\"feast_feature_view\", \"driver_hourly_stats\")\n", + " mlflow.log_param(\"features\", \", \".join(FEATURE_COLS))\n", + " mlflow.log_param(\"target\", TARGET)\n", + " mlflow.log_param(\"n_training_samples\", len(X_train))\n", + "\n", + " mlflow.log_metric(\"mse\", mse)\n", + " mlflow.log_metric(\"r2_score\", r2)\n", + "\n", + " mlflow.sklearn.log_model(model, \"driver_conv_model\")\n", + "\n", + " print(f\"MLflow run: {run.info.run_id}\")\n", + " print(f\"MSE: {mse:.4f}, R2: {r2:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. 
Materialize features to the online store\n", + "\n", + "Materialization copies the latest feature values from the offline store (parquet)\n", + "into the online store (SQLite/remote server) so they can be served with low\n", + "latency for real-time inference.\n", + "\n", + "In production, you'd run this on a schedule (e.g., hourly cron job)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!feast materialize-incremental $(date -u +'%Y-%m-%dT%H:%M:%S')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Online feature serving\n", + "\n", + "Retrieve the latest feature values for specific entities. This is what you'd\n", + "call at inference time — given a driver_id, get their current features to feed\n", + "into your model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "online_features = store.get_online_features(\n", + " features=[\n", + " \"driver_hourly_stats:conv_rate\",\n", + " \"driver_hourly_stats:acc_rate\",\n", + " \"driver_hourly_stats:avg_daily_trips\",\n", + " ],\n", + " entity_rows=[{\"driver_id\": 1001}, {\"driver_id\": 1002}],\n", + ").to_dict()\n", + "\n", + "print(\"Online features (latest values):\")\n", + "for k, v in online_features.items():\n", + " print(f\" {k}: {v}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Use online features for inference\n", + "import pandas as pd\n", + "\n", + "inference_df = pd.DataFrame(online_features)\n", + "predictions = model.predict(inference_df[FEATURE_COLS])\n", + "\n", + "for driver_id, pred in zip(inference_df[\"driver_id\"], predictions):\n", + " print(f\"Driver {driver_id}: predicted conv_rate = {pred:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "| Step | Command / API | Purpose |\n", + 
"|------|--------------|--------|\n", + "| Define features | `features.py` | Declare entities, sources, feature views |\n", + "| Register | `feast apply` | Write definitions to MariaDB registry |\n", + "| Training data | `store.get_historical_features()` | Point-in-time correct join |\n", + "| Materialize | `feast materialize-incremental` | Push latest values to online store |\n", + "| Online serving | `store.get_online_features()` | Low-latency lookup by entity key |\n", + "\n", + "### When to use Feast vs raw parquet\n", + "\n", + "- **Use Feast** when you need consistent feature definitions across training and\n", + " serving, point-in-time correctness, or online feature serving.\n", + "- **Use raw parquet** for one-off experiments where feature management overhead\n", + " isn't worth it." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/feast/feature_store.yaml b/feast/feature_store.yaml new file mode 100644 index 0000000..a45f9bf --- /dev/null +++ b/feast/feature_store.yaml @@ -0,0 +1,22 @@ +# Template — fill in your namespace and registry credentials. +# The operator-generated ConfigMap (feast--client) only has online_store +# config. For full functionality (apply, materialize, historical features), +# you need registry access too. 
+# +# Get your registry URI from the feast-registry-config secret: +# kubectl get secret feast-registry-config -n -o jsonpath='{.data.sql}' | base64 -d +project: my_features +provider: local +registry: + registry_type: sql + # paste your registry URI here (from feast-registry-config secret) + path: "mysql+pymysql://USER:PASSWORD@mariadb-feast.NAMESPACE.svc.cluster.local:3306/feast_NAMESPACE" + cache_ttl_seconds: 60 +online_store: + path: http://feast-my-store-online.NAMESPACE.svc.cluster.local:80 + type: remote +offline_store: + type: dask +auth: + type: no_auth +entity_key_serialization_version: 3 diff --git a/feast/features.py b/feast/features.py new file mode 100644 index 0000000..8116f89 --- /dev/null +++ b/feast/features.py @@ -0,0 +1,50 @@ +""" +Feast feature definitions for the driver stats example. + +This file defines: +- An entity (driver_id) identifying what we're tracking features for +- A data source (parquet file with historical driver data) +- A feature view (driver_hourly_stats) with three features + +To register these with the Feast registry: + feast apply + +To materialize features to the online store: + feast materialize-incremental $(date -u +"%Y-%m-%dT%H:%M:%S") +""" + +from datetime import timedelta + +from feast import Entity, FeatureView, Field, FileSource +from feast.types import Float32, Int64 + +# Entity: the "primary key" for feature lookups. +# When you request features, you provide entity values (e.g. driver_id=1001). +driver = Entity( + name="driver_id", + description="Unique driver identifier", +) + +# Data source: where historical feature data lives. +# This parquet file is generated by the notebook example. +driver_stats_source = FileSource( + path="data/driver_stats.parquet", + timestamp_field="event_timestamp", + created_timestamp_column="created", +) + +# Feature view: a logical group of features from one data source. 
+# - `ttl` controls how stale a feature can be before it's considered expired +# - `online=True` means features are materialized to the online store +driver_hourly_stats = FeatureView( + name="driver_hourly_stats", + entities=[driver], + ttl=timedelta(days=7), + schema=[ + Field(name="conv_rate", dtype=Float32), + Field(name="acc_rate", dtype=Float32), + Field(name="avg_daily_trips", dtype=Int64), + ], + source=driver_stats_source, + online=True, +) diff --git a/feast/featurestore.yaml b/feast/featurestore.yaml new file mode 100644 index 0000000..2afc7c7 --- /dev/null +++ b/feast/featurestore.yaml @@ -0,0 +1,45 @@ +# Example FeatureStore CR for prokube. +# Edit the namespace to match your Kubeflow profile. +# +# The operator will create: +# - A Feast deployment + service (online feature server) +# - PVCs for online and offline stores +# - A ConfigMap (feast--client) with client connection info +# +# Prerequisites: +# - feast-registry-config secret must exist in your namespace +# (provisioned by admin or the user-management operator) +apiVersion: feast.dev/v1 +kind: FeatureStore +metadata: + name: my-store + namespace: # <-- change this +spec: + feastProject: my_features + services: + registry: + local: + persistence: + store: + type: sql + secretRef: + name: feast-registry-config + onlineStore: + persistence: + file: + pvc: + create: + storageClassName: mayastor-no-redundancy # adjust for your cluster + resources: + requests: + storage: 5Gi + offlineStore: + persistence: + file: + type: dask + pvc: + create: + storageClassName: mayastor-no-redundancy # adjust for your cluster + resources: + requests: + storage: 10Gi From 0299af13d219fc4e04dee099866226ef4d4fa47f Mon Sep 17 00:00:00 2001 From: Igor Kvachenok Date: Wed, 29 Apr 2026 12:31:02 +0200 Subject: [PATCH 2/4] fix(feast): update to SQLite registry + Redis online store (v1 API) --- feast/README.md | 51 ++++++++++++++++++++++------------------ feast/feature_store.yaml | 27 ++++++++++----------- 
feast/featurestore.yaml | 37 ++++++++++++++++------------- 3 files changed, 61 insertions(+), 54 deletions(-) diff --git a/feast/README.md b/feast/README.md index b43f3e3..a672dfb 100644 --- a/feast/README.md +++ b/feast/README.md @@ -6,40 +6,44 @@ for feature management in ML workflows. ## Prerequisites - Feast must be enabled on your cluster (ask your admin) -- A `FeatureStore` CR must be created in your workspace (see below) -- The `feast-registry-config` secret must exist in your namespace +- A `feast-redis-config` secret must exist in your namespace (see step 1) +- A `FeatureStore` CR must be created in your workspace (see step 2) ## Quick Start -### 1. Create a FeatureStore in your workspace +### 1. Create the Redis secret -Apply the example CR (edit the namespace): +Ask your admin for the Redis host and password, then: ```bash -kubectl apply -f featurestore.yaml +kubectl create secret generic feast-redis-config \ + -n \ + --from-literal=redis='connection_string: ":6379,password="' ``` -Wait for it to become ready: +> **Note:** The secret key must be named `redis` and the value must be a YAML map. +> Use `host:port,password=...` format — **not** a `redis://` URI. + +### 2. Create a FeatureStore in your workspace + +Edit `featurestore.yaml` to set your namespace, then apply: ```bash -kubectl get featurestore -n -w +kubectl apply -f featurestore.yaml ``` -### 2. Get your client configuration - -The operator creates a ConfigMap with connection info: +Wait for it to become ready: ```bash -kubectl get configmap feast--client -n \ - -o jsonpath='{.data.feature_store\.yaml}' +kubectl get featurestore -n -w ``` -Save this as `feature_store.yaml` in your working directory. This config only -contains the **online store** endpoint (remote HTTP). For operations that need -registry access (`feast apply`, `get_historical_features`, `materialize`), you -need to build a full config — see the notebook example. +### 3. 
Configure your feature_store.yaml + +Edit `feature_store.yaml` with your Redis connection details. Use this file in +your notebooks or scripts to run `feast apply`, `materialize`, and retrieve features. -### 3. Run the notebook +### 4. Run the notebook Open `feast_example.ipynb` in your Kubeflow notebook and follow the steps. @@ -48,7 +52,7 @@ Open `feast_example.ipynb` in your Kubeflow notebook and follow the steps. | File | Description | |------|-------------| | `featurestore.yaml` | FeatureStore CR to deploy in your namespace | -| `feature_store.yaml` | Client config template (fill in your namespace + registry) | +| `feature_store.yaml` | Client config template (fill in Redis details) | | `features.py` | Feature definitions — entities, sources, feature views | | `feast_example.ipynb` | End-to-end notebook: define, apply, materialize, train, serve | @@ -58,12 +62,13 @@ Open `feast_example.ipynb` in your Kubeflow notebook and follow the steps. ┌─────────────────────────────────┐ │ Your Namespace │ │ │ - feast apply ──────▶ MariaDB (registry) │ + feast apply ──────▶ SQLite on PVC (registry) │ │ - feature definitions │ │ - entity schemas │ │ │ - materialize ──────▶ SQLite on PVC (online store) │ + materialize ──────▶ Redis (online store) │ │ - latest feature values │ + │ - sub-ms latency │ │ │ historical ──────▶ Parquet on PVC (offline store) │ features │ - time-series feature data │ @@ -73,9 +78,9 @@ Open `feast_example.ipynb` in your Kubeflow notebook and follow the steps. └─────────────────────────────────┘ ``` -- **Registry** (MariaDB): stores metadata — what features exist, their schemas, - data sources. Shared across all Feast processes in your namespace. -- **Online store** (SQLite/PVC): key-value store with the *latest* feature values +- **Registry** (SQLite/PVC): stores metadata — what features exist, their schemas, + data sources. Accessible from within your namespace. 
+- **Online store** (Redis): key-value store with the *latest* feature values per entity. Updated by `feast materialize`. Used for real-time inference. - **Offline store** (Dask/file/PVC): historical feature data in parquet files. Used for training dataset generation with point-in-time correctness. diff --git a/feast/feature_store.yaml b/feast/feature_store.yaml index a45f9bf..0d6b181 100644 --- a/feast/feature_store.yaml +++ b/feast/feature_store.yaml @@ -1,22 +1,19 @@ -# Template — fill in your namespace and registry credentials. -# The operator-generated ConfigMap (feast--client) only has online_store -# config. For full functionality (apply, materialize, historical features), -# you need registry access too. +# Template — fill in your Redis connection details. +# Get the Redis host and password from your admin. # -# Get your registry URI from the feast-registry-config secret: -# kubectl get secret feast-registry-config -n -o jsonpath='{.data.sql}' | base64 -d +# For workflows running inside the cluster, you can use /tmp/registry.db +# as the registry path (ephemeral, single-run). For persistent registry +# access, mount the registry PVC and use /data/registry/registry.db. 
project: my_features provider: local -registry: - registry_type: sql - # paste your registry URI here (from feast-registry-config secret) - path: "mysql+pymysql://USER:PASSWORD@mariadb-feast.NAMESPACE.svc.cluster.local:3306/feast_NAMESPACE" - cache_ttl_seconds: 60 -online_store: - path: http://feast-my-store-online.NAMESPACE.svc.cluster.local:80 - type: remote offline_store: - type: dask + type: file +online_store: + type: redis + connection_string: ":6379,password=" +registry: + registry_type: file + path: /tmp/registry.db auth: type: no_auth entity_key_serialization_version: 3 diff --git a/feast/featurestore.yaml b/feast/featurestore.yaml index 2afc7c7..2949beb 100644 --- a/feast/featurestore.yaml +++ b/feast/featurestore.yaml @@ -3,12 +3,11 @@ # # The operator will create: # - A Feast deployment + service (online feature server) -# - PVCs for online and offline stores +# - PVCs for the SQLite registry and offline data store # - A ConfigMap (feast--client) with client connection info # # Prerequisites: -# - feast-registry-config secret must exist in your namespace -# (provisioned by admin or the user-management operator) +# - feast-redis-config secret must exist in your namespace (see README) apiVersion: feast.dev/v1 kind: FeatureStore metadata: @@ -17,29 +16,35 @@ metadata: spec: feastProject: my_features services: + runFeastApplyOnInit: false + securityContext: + runAsUser: 0 registry: local: persistence: - store: - type: sql - secretRef: - name: feast-registry-config - onlineStore: - persistence: - file: - pvc: - create: - storageClassName: mayastor-no-redundancy # adjust for your cluster - resources: - requests: - storage: 5Gi + file: + pvc: + mountPath: /data/registry + create: + storageClassName: mayastor-no-redundancy # adjust for your cluster + resources: + requests: + storage: 1Gi offlineStore: persistence: file: type: dask pvc: + mountPath: /data/offline create: storageClassName: mayastor-no-redundancy # adjust for your cluster resources: requests: 
storage: 10Gi + onlineStore: + persistence: + store: + type: redis + secretRef: + name: feast-redis-config + secretKeyName: redis From 0af087b96b6d429fb3111985ed27363d6a7bf966 Mon Sep 17 00:00:00 2001 From: Igor Kvachenok Date: Wed, 29 Apr 2026 12:46:40 +0200 Subject: [PATCH 3/4] fix(feast): per-user Redis workflow, rename featurestore.yaml to feast-cr.yaml, rewrite notebook --- feast/README.md | 99 +++++---- feast/{featurestore.yaml => feast-cr.yaml} | 2 +- feast/feast_example.ipynb | 221 +++++++++++---------- 3 files changed, 178 insertions(+), 144 deletions(-) rename feast/{featurestore.yaml => feast-cr.yaml} (98%) diff --git a/feast/README.md b/feast/README.md index a672dfb..5db3a40 100644 --- a/feast/README.md +++ b/feast/README.md @@ -6,55 +6,77 @@ for feature management in ML workflows. ## Prerequisites - Feast must be enabled on your cluster (ask your admin) -- A `feast-redis-config` secret must exist in your namespace (see step 1) -- A `FeatureStore` CR must be created in your workspace (see step 2) +- You have `kubectl` access to your Kubeflow profile namespace ## Quick Start -### 1. Create the Redis secret +### 1. Deploy a Redis instance -Ask your admin for the Redis host and password, then: +Create a password secret and a Redis CR in your namespace: ```bash -kubectl create secret generic feast-redis-config \ +# Generate a random password +kubectl create secret generic redis-feast \ -n \ - --from-literal=redis='connection_string: ":6379,password="' -``` + --from-literal=password=$(openssl rand -base64 24 | tr -d '/') -> **Note:** The secret key must be named `redis` and the value must be a YAML map. -> Use `host:port,password=...` format — **not** a `redis://` URI. +# Deploy the Redis CR +kubectl apply -f redis-cr.yaml # see note below — edit namespace first +kubectl get redis -n <namespace> -w +``` -### 2. Create a FeatureStore in your workspace +`redis-cr.yaml` is a plain Kubernetes manifest for the OpsTree Redis operator.
+Create it with the contents shown in the prokube [user docs](https://docs.prokube.ai/user_docs/feast/). -Edit `featurestore.yaml` to set your namespace, then apply: +### 2. Create the Feast Redis secret ```bash -kubectl apply -f featurestore.yaml +NAMESPACE=<your-namespace> +PASSWORD=$(kubectl get secret redis-feast -n $NAMESPACE \ + -o jsonpath='{.data.password}' | base64 -d) + +cat > /tmp/redis-config.yaml << EOF +connection_string: "redis-feast.${NAMESPACE}.svc.cluster.local:6379,password=${PASSWORD}" +EOF + +kubectl create secret generic feast-redis-config \ + -n $NAMESPACE \ + --from-file=redis=/tmp/redis-config.yaml + +rm /tmp/redis-config.yaml ``` -Wait for it to become ready: +### 3. Deploy a FeatureStore + +Edit `feast-cr.yaml` to set your namespace, then: ```bash -kubectl get featurestore -n -w +kubectl apply -f feast-cr.yaml +kubectl get featurestore -n <namespace> -w # wait until Ready ``` -### 3. Configure your feature_store.yaml +### 4. Run the notebook + +Open `feast_example.ipynb` in your Kubeflow notebook. The notebook reads the +`feast-redis-config` secret automatically and builds `feature_store.yaml` for you. -Edit `feature_store.yaml` with your Redis connection details. Use this file in -your notebooks or scripts to run `feast apply`, `materialize`, and retrieve features. +## Files -### 4. Run the notebook +| File | What it is | +|------|------------| +| `feast-cr.yaml` | Kubernetes manifest — deploys the FeatureStore CR | +| `feature_store.yaml` | Feast SDK config — tells the Python client where registry and stores are | +| `features.py` | Feature definitions — entities, data sources, feature views | +| `feast_example.ipynb` | End-to-end notebook: generate data, apply, train, materialize, serve | -Open `feast_example.ipynb` in your Kubeflow notebook and follow the steps. +### Why two YAML files? -## Examples +`feast-cr.yaml` is a **Kubernetes resource** (`kind: FeatureStore`) that the operator +reads to provision PVCs and the Feast server pod.
You apply it once with `kubectl`. -| File | Description | -|------|-------------| -| `featurestore.yaml` | FeatureStore CR to deploy in your namespace | -| `feature_store.yaml` | Client config template (fill in Redis details) | -| `features.py` | Feature definitions — entities, sources, feature views | -| `feast_example.ipynb` | End-to-end notebook: define, apply, materialize, train, serve | +`feature_store.yaml` is a **Feast SDK config file** (fixed filename — Feast convention) +that the Python client and CLI read to know how to connect to the registry and stores. +You use it in notebooks and scripts. ## Architecture @@ -62,25 +84,28 @@ Open `feast_example.ipynb` in your Kubeflow notebook and follow the steps. ┌─────────────────────────────────┐ │ Your Namespace │ │ │ - feast apply ──────▶ SQLite on PVC (registry) │ - │ - feature definitions │ + │ Redis CR (redis-feast) │ + │ - your private Redis instance │ + │ │ + feast apply ──────▶ SQLite /tmp/registry.db │ + (notebook) │ - feature definitions │ │ - entity schemas │ │ │ - materialize ──────▶ Redis (online store) │ + materialize ──────▶ Redis online store │ │ - latest feature values │ │ - sub-ms latency │ + │ - persistent across sessions │ │ │ historical ──────▶ Parquet on PVC (offline store) │ features │ - time-series feature data │ │ │ - │ Feast Server (deployment) │ - │ - serves online features │ + │ Feast Server pod │ + │ - HTTP API for online features │ + │ - registry on PVC (/data/...) │ └─────────────────────────────────┘ ``` -- **Registry** (SQLite/PVC): stores metadata — what features exist, their schemas, - data sources. Accessible from within your namespace. -- **Online store** (Redis): key-value store with the *latest* feature values - per entity. Updated by `feast materialize`. Used for real-time inference. -- **Offline store** (Dask/file/PVC): historical feature data in parquet files. - Used for training dataset generation with point-in-time correctness. 
+- **Redis** (per-namespace): your private online store. You own and manage it. +- **Registry** (SQLite): feature definitions. In notebook workflows, uses `/tmp/registry.db`. + The Feast server pod uses the registry PVC at `/data/registry/registry.db`. +- **Offline store** (parquet/PVC): historical feature data for training. diff --git a/feast/featurestore.yaml b/feast/feast-cr.yaml similarity index 98% rename from feast/featurestore.yaml rename to feast/feast-cr.yaml index 2949beb..7aa27fb 100644 --- a/feast/featurestore.yaml +++ b/feast/feast-cr.yaml @@ -33,7 +33,7 @@ spec: offlineStore: persistence: file: - type: dask + type: file pvc: mountPath: /data/offline create: diff --git a/feast/feast_example.ipynb b/feast/feast_example.ipynb index 7409687..1bd09a6 100644 --- a/feast/feast_example.ipynb +++ b/feast/feast_example.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "id": "1eaa631f", "metadata": {}, "source": [ "# Feast Feature Store — End-to-End Example\n", @@ -9,22 +10,23 @@ "This notebook walks through the full Feast workflow on prokube:\n", "\n", "1. Generate sample feature data\n", - "2. Configure Feast client\n", + "2. Configure the Feast client\n", "3. Register features in the registry (`feast apply`)\n", "4. Retrieve historical features for training\n", "5. Train a model and log to MLflow\n", - "6. Materialize features to the online store\n", + "6. Materialize features to the Redis online store\n", "7. 
Serve features online for inference\n", "\n", "## Prerequisites\n", "\n", - "- A `FeatureStore` CR is deployed in your namespace (see `featurestore.yaml`)\n", - "- The `feast-registry-config` secret exists in your namespace" + "- A `FeatureStore` CR is deployed in your namespace (see `feast-cr.yaml`)\n", + "- The `feast-redis-config` secret exists in your namespace (create it as described in the README, or ask your admin)\n" ] }, { "cell_type": "code", "execution_count": null, + "id": "4b8f4c32", "metadata": {}, "outputs": [], "source": [ @@ -33,17 +35,19 @@ }, { "cell_type": "markdown", + "id": "3956d1d1", "metadata": {}, "source": [ "## 1. Generate sample data\n", "\n", "We create a parquet file simulating hourly driver statistics over the past 7 days.\n", - "In a real scenario, this would come from your data pipeline." + "In a real scenario this would come from your data pipeline.\n" ] }, { "cell_type": "code", "execution_count": null, + "id": "f2b288da", "metadata": {}, "outputs": [], "source": [ @@ -58,73 +62,50 @@ "now = datetime.datetime.now()\n", "timestamps = [now - datetime.timedelta(hours=i) for i in range(n)]\n", "\n", - "driver_df = pd.DataFrame(\n", - " {\n", - " \"driver_id\": np.random.choice([1001, 1002, 1003, 1004, 1005], n),\n", - " \"event_timestamp\": timestamps,\n", - " \"conv_rate\": np.random.uniform(0.1, 1.0, n).astype(np.float32),\n", - " \"acc_rate\": np.random.uniform(0.5, 1.0, n).astype(np.float32),\n", - " \"avg_daily_trips\": np.random.randint(1, 50, n).astype(np.int64),\n", - " \"created\": timestamps,\n", - " }\n", - ")\n", + "driver_df = pd.DataFrame({\n", + " \"driver_id\": np.random.choice([1001, 1002, 1003, 1004, 1005], n),\n", + " \"event_timestamp\": timestamps,\n", + " \"conv_rate\": np.random.uniform(0.1, 1.0, n).astype(np.float32),\n", + " \"acc_rate\": np.random.uniform(0.5, 1.0, n).astype(np.float32),\n", + " \"avg_daily_trips\": np.random.randint(1, 50, n).astype(np.int64),\n", + " \"created\": timestamps,\n", + "})\n", "\n", "os.makedirs(\"data\", 
exist_ok=True)\n", "driver_df.to_parquet(\"data/driver_stats.parquet\")\n", "print(f\"Created {n} rows for {driver_df['driver_id'].nunique()} drivers\")\n", - "driver_df.head()" + "driver_df.head()\n" ] }, { "cell_type": "markdown", + "id": "ef7e1942", "metadata": {}, "source": [ "## 2. Configure the Feast client\n", "\n", - "The operator creates a ConfigMap with basic client config, but it only has the\n", - "online store endpoint. For full functionality (apply, materialize, historical\n", - "features) we need registry access too.\n", + "We build `feature_store.yaml` by reading the Redis connection string from the\n", + "`feast-redis-config` secret in our namespace.\n", "\n", - "We build `feature_store.yaml` by reading the registry URI from the\n", - "`feast-registry-config` secret." + "The registry uses a local SQLite file (`/tmp/registry.db`). This is ephemeral\n", + "within the notebook session — re-run `feast apply` at the start of each session\n", + "to repopulate it. The Redis online store is persistent across sessions.\n" ] }, { "cell_type": "code", "execution_count": null, + "id": "a8804025", "metadata": {}, "outputs": [], "source": [ + "import base64\n", "import subprocess\n", - "\n", - "\n", - "def get_registry_uri():\n", - " \"\"\"Read the registry URI from the feast-registry-config secret.\n", - "\n", - " In a Kubeflow notebook pod, you can read secrets via kubectl (if RBAC allows)\n", - " or mount them as volumes. 
Adjust this for your environment.\n", - " \"\"\"\n", - " result = subprocess.run(\n", - " [\n", - " \"kubectl\",\n", - " \"get\",\n", - " \"secret\",\n", - " \"feast-registry-config\",\n", - " \"-o\",\n", - " \"jsonpath={.data.sql}\",\n", - " ],\n", - " capture_output=True,\n", - " text=True,\n", - " )\n", - " import base64\n", - "\n", - " decoded = base64.b64decode(result.stdout).decode()\n", - " # Format is \"path: mysql+pymysql://...\"\n", - " return decoded.replace(\"path: \", \"\").strip()\n", + "import yaml\n", "\n", "\n", "def get_namespace():\n", - " \"\"\"Get the current namespace from the service account.\"\"\"\n", + " \"\"\"Read the current namespace from the pod's service account.\"\"\"\n", " try:\n", " with open(\"/var/run/secrets/kubernetes.io/serviceaccount/namespace\") as f:\n", " return f.read().strip()\n", @@ -134,48 +115,65 @@ " ).decode().strip()\n", "\n", "\n", + "def get_redis_connection_string():\n", + " \"\"\"Read the Redis connection string from the feast-redis-config secret.\n", + "\n", + " The secret has key 'redis' whose value is a YAML map:\n", + " connection_string: \"host:port,password=...\"\n", + " \"\"\"\n", + " result = subprocess.run(\n", + " [\"kubectl\", \"get\", \"secret\", \"feast-redis-config\",\n", + " \"-o\", \"jsonpath={.data.redis}\"],\n", + " capture_output=True, text=True, check=True,\n", + " )\n", + " raw = base64.b64decode(result.stdout).decode()\n", + " return yaml.safe_load(raw)[\"connection_string\"]\n", + "\n", + "\n", "NAMESPACE = get_namespace()\n", - "REGISTRY_URI = get_registry_uri()\n", - "FEAST_PROJECT = \"my_features\" # must match your FeatureStore CR's spec.feastProject\n", - "ONLINE_STORE_HOST = f\"feast-my-store-online.{NAMESPACE}.svc.cluster.local\"\n", - "\n", - "feature_store_yaml = f\"\"\"project: {FEAST_PROJECT}\n", - "provider: local\n", - "registry:\n", - " registry_type: sql\n", - " path: \\\"{REGISTRY_URI}\\\"\n", - " cache_ttl_seconds: 60\n", - "online_store:\n", - " path: 
http://{ONLINE_STORE_HOST}:80\n", - " type: remote\n", - "offline_store:\n", - " type: dask\n", - "auth:\n", - " type: no_auth\n", - "entity_key_serialization_version: 3\n", - "\"\"\"\n", + "REDIS_CONNECTION_STRING = get_redis_connection_string()\n", + "FEAST_PROJECT = \"my_features\" # must match spec.feastProject in your FeatureStore CR\n", + "\n", + "feature_store_yaml = (\n", + " f\"project: {FEAST_PROJECT}\\n\"\n", + " \"provider: local\\n\"\n", + " \"offline_store:\\n\"\n", + " \" type: file\\n\"\n", + " \"online_store:\\n\"\n", + " \" type: redis\\n\"\n", + " f\" connection_string: \\\"{REDIS_CONNECTION_STRING}\\\"\\n\"\n", + " \"registry:\\n\"\n", + " \" registry_type: file\\n\"\n", + " \" path: /tmp/registry.db\\n\"\n", + " \"auth:\\n\"\n", + " \" type: no_auth\\n\"\n", + " \"entity_key_serialization_version: 3\\n\"\n", + ")\n", "\n", "with open(\"feature_store.yaml\", \"w\") as f:\n", " f.write(feature_store_yaml)\n", "\n", "print(\"feature_store.yaml written\")\n", "print(f\"Namespace: {NAMESPACE}\")\n", - "print(f\"Registry: {REGISTRY_URI[:50]}...\")" + "# Print host:port only — hide password\n", + "print(f\"Redis: {REDIS_CONNECTION_STRING.split(',')[0]}\")\n" ] }, { "cell_type": "markdown", + "id": "358c2624", "metadata": {}, "source": [ "## 3. Register features\n", "\n", - "`feast apply` reads `features.py` and registers the entity, data source, and\n", - "feature view definitions in the MariaDB registry." 
+ "`feast apply` reads `features.py` and writes the entity, data source, and\n", + "feature view definitions to the local SQLite registry (`/tmp/registry.db`).\n" ] }, { "cell_type": "code", "execution_count": null, + "id": "52ecbcf6", "metadata": {}, "outputs": [], "source": [ @@ -185,28 +183,31 @@ { "cell_type": "code", "execution_count": null, + "id": "10eb1055", "metadata": {}, "outputs": [], "source": [ - "# Verify what's registered\n", + "# Verify what was registered\n", "!feast feature-views list" ] }, { "cell_type": "markdown", + "id": "c7f35090", "metadata": {}, "source": [ "## 4. Retrieve historical features for training\n", "\n", "`get_historical_features` performs a **point-in-time join**: for each entity row,\n", "it finds the most recent feature values as of that entity's timestamp. This\n", - "prevents data leakage — you only see features that were available at the time\n", - "the event occurred." + "prevents data leakage — you only see features that were available when the\n", + "event occurred.\n" ] }, { "cell_type": "code", "execution_count": null, + "id": "7796c5fc", "metadata": {}, "outputs": [], "source": [ @@ -214,13 +215,10 @@ "\n", "store = FeatureStore(repo_path=\".\")\n", "\n", - "# Entity dataframe: \"give me features for these drivers at these timestamps\"\n", - "entity_df = pd.DataFrame(\n", - " {\n", - " \"driver_id\": [1001, 1002, 1003, 1004, 1005],\n", - " \"event_timestamp\": [now] * 5,\n", - " }\n", - ")\n", + "entity_df = pd.DataFrame({\n", + " \"driver_id\": [1001, 1002, 1003, 1004, 1005],\n", + " \"event_timestamp\": [now] * 5,\n", + "})\n", "\n", "training_df = store.get_historical_features(\n", " entity_df=entity_df,\n", @@ -232,22 +230,24 @@ ").to_df()\n", "\n", "print(\"Training data (point-in-time correct):\")\n", - "training_df" + "training_df\n" ] }, { "cell_type": "markdown", + "id": "5e15eee6", "metadata": {}, "source": [ "## 5. 
Train a model and log to MLflow\n", "\n", "Use the Feast-provided training data to train a simple model, then log\n", - "everything to MLflow — including which features were used." + "everything to MLflow — including which Feast features were used.\n" ] }, { "cell_type": "code", "execution_count": null, + "id": "61f19018", "metadata": {}, "outputs": [], "source": [ @@ -256,7 +256,7 @@ "from sklearn.metrics import mean_squared_error, r2_score\n", "from sklearn.model_selection import train_test_split\n", "\n", - "# MLflow tracking URI — adjust if your cluster uses a different location\n", + "# Adjust the tracking URI if your cluster uses a different location\n", "mlflow.set_tracking_uri(\"http://mlflow-mlflow-tracking-server.mlflow.svc.cluster.local:80\")\n", "mlflow.set_experiment(\"feast-driver-prediction\")\n", "\n", @@ -276,38 +276,36 @@ " mse = mean_squared_error(y_test, y_pred)\n", " r2 = r2_score(y_test, y_pred)\n", "\n", - " # Log feature provenance — which Feast features fed this model\n", " mlflow.log_param(\"feast_project\", FEAST_PROJECT)\n", " mlflow.log_param(\"feast_feature_view\", \"driver_hourly_stats\")\n", " mlflow.log_param(\"features\", \", \".join(FEATURE_COLS))\n", " mlflow.log_param(\"target\", TARGET)\n", " mlflow.log_param(\"n_training_samples\", len(X_train))\n", - "\n", " mlflow.log_metric(\"mse\", mse)\n", " mlflow.log_metric(\"r2_score\", r2)\n", - "\n", " mlflow.sklearn.log_model(model, \"driver_conv_model\")\n", "\n", " print(f\"MLflow run: {run.info.run_id}\")\n", - " print(f\"MSE: {mse:.4f}, R2: {r2:.4f}\")" + " print(f\"MSE: {mse:.4f}, R2: {r2:.4f}\")\n" ] }, { "cell_type": "markdown", + "id": "51b15c4b", "metadata": {}, "source": [ - "## 6. Materialize features to the online store\n", + "## 6. 
Materialize features to Redis\n", "\n", - "Materialization copies the latest feature values from the offline store (parquet)\n", - "into the online store (SQLite/remote server) so they can be served with low\n", - "latency for real-time inference.\n", + "Materialization reads the latest feature values from the offline parquet store\n", + "and writes them to Redis for low-latency online serving.\n", "\n", - "In production, you'd run this on a schedule (e.g., hourly cron job)." + "In production you would run this on a schedule (e.g. hourly cron job).\n" ] }, { "cell_type": "code", "execution_count": null, + "id": "6183ced7", "metadata": {}, "outputs": [], "source": [ @@ -316,18 +314,22 @@ }, { "cell_type": "markdown", + "id": "66ec423c", "metadata": {}, "source": [ "## 7. Online feature serving\n", "\n", - "Retrieve the latest feature values for specific entities. This is what you'd\n", - "call at inference time — given a driver_id, get their current features to feed\n", - "into your model." + "Retrieve the latest feature values for specific entities. 
The Feast SDK reads\n", + "directly from Redis — no round-trip through the Feast server pod is needed.\n", + "\n", + "This is what you call at inference time: given a `driver_id`, get their\n", + "current features to feed into the model.\n" ] }, { "cell_type": "code", "execution_count": null, + "id": "6982cdba", "metadata": {}, "outputs": [], "source": [ @@ -340,29 +342,29 @@ " entity_rows=[{\"driver_id\": 1001}, {\"driver_id\": 1002}],\n", ").to_dict()\n", "\n", - "print(\"Online features (latest values):\")\n", + "print(\"Online features (latest values from Redis):\")\n", "for k, v in online_features.items():\n", - " print(f\" {k}: {v}\")" + " print(f\" {k}: {v}\")\n" ] }, { "cell_type": "code", "execution_count": null, + "id": "72e36e86", "metadata": {}, "outputs": [], "source": [ "# Use online features for inference\n", - "import pandas as pd\n", - "\n", "inference_df = pd.DataFrame(online_features)\n", "predictions = model.predict(inference_df[FEATURE_COLS])\n", "\n", "for driver_id, pred in zip(inference_df[\"driver_id\"], predictions):\n", - " print(f\"Driver {driver_id}: predicted conv_rate = {pred:.4f}\")" + " print(f\"Driver {driver_id}: predicted conv_rate = {pred:.4f}\")\n" ] }, { "cell_type": "markdown", + "id": "72a52751", "metadata": {}, "source": [ "## Summary\n", @@ -370,17 +372,24 @@ "| Step | Command / API | Purpose |\n", "|------|--------------|--------|\n", "| Define features | `features.py` | Declare entities, sources, feature views |\n", - "| Register | `feast apply` | Write definitions to MariaDB registry |\n", - "| Training data | `store.get_historical_features()` | Point-in-time correct join |\n", - "| Materialize | `feast materialize-incremental` | Push latest values to online store |\n", - "| Online serving | `store.get_online_features()` | Low-latency lookup by entity key |\n", + "| Register | `feast apply` | Write definitions to local SQLite registry |\n", + "| Training data | `store.get_historical_features()` | Point-in-time correct 
join from parquet |\n", + "| Materialize | `feast materialize-incremental` | Push latest values to Redis online store |\n", + "| Online serving | `store.get_online_features()` | Sub-ms lookup from Redis by entity key |\n", "\n", "### When to use Feast vs raw parquet\n", "\n", "- **Use Feast** when you need consistent feature definitions across training and\n", - " serving, point-in-time correctness, or online feature serving.\n", + " serving, point-in-time correctness, or low-latency online feature serving.\n", "- **Use raw parquet** for one-off experiments where feature management overhead\n", - " isn't worth it." + " is not worth it.\n", + "\n", + "### Note on the SQLite registry\n", + "\n", + "This notebook uses `/tmp/registry.db` as the registry, which is local to the\n", + "notebook session. Re-run the \"Configure\" and `feast apply` cells at the start\n", + "of each new session. The Redis online store is persistent across sessions —\n", + "features materialized in one session are still available in subsequent ones.\n" ] } ], @@ -396,5 +405,5 @@ } }, "nbformat": 4, - "nbformat_minor": 4 + "nbformat_minor": 5 } From 2091135da94ec1b6a8aa7f54467b8b68af9654cf Mon Sep 17 00:00:00 2001 From: Igor Kvachenok Date: Wed, 29 Apr 2026 12:48:03 +0200 Subject: [PATCH 4/4] docs(feast): add store backends and recommendations table to README --- feast/README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/feast/README.md b/feast/README.md index 5db3a40..c2fcb7e 100644 --- a/feast/README.md +++ b/feast/README.md @@ -80,6 +80,18 @@ You use it in notebooks and scripts. ## Architecture +Feast has three stores. Here is what each one does and which backend prokube uses: + +| Store | Purpose | Prokube default | Alternatives | +|-------|---------|-----------------|--------------| +| **Registry** | Stores feature definitions (entities, feature views, sources). Written on `feast apply`, read at startup. | SQLite on PVC | SQL databases (PostgreSQL, etc.) 
for multi-replica or shared setups | +| **Online store** | Holds the *latest* feature value per entity. Read on every inference request — latency critical. | Redis (your `Redis` CR) | SQLite on PVC (dev/test only; not multi-replica safe) | +| **Offline store** | Historical feature records for point-in-time joins during training. Batch workload, not on serving path. | Parquet/file on PVC | Dask (same parquet files, distributed compute — use only if data exceeds pod memory); cloud warehouses (BigQuery, Snowflake, Redshift) | + +The offline store default is `type: file` (pandas). You can switch to `type: dask` in +`feast-cr.yaml` if your datasets are too large to fit in memory, but it adds complexity +and is rarely needed. + ``` ┌─────────────────────────────────┐ │ Your Namespace │