From 8abe8ceaf70b72391dc644bd16a7c8a0e48ae692 Mon Sep 17 00:00:00 2001 From: Harizo Rajaona Date: Wed, 4 Nov 2020 16:55:36 +0100 Subject: [PATCH 01/14] Started refactoring repository structure --- README.md | 20 +++++++++++++++++++ .../index-all-hive-databases/README.md | 0 .../index-all-hive-databases.py | 0 .../client_api_utils}/README.md | 0 ...acefully_provited_hive_legacy_migration.py | 0 .../project_bundle_deployment/README.md | 0 .../manage_bundles.py | 0 .../compute_partition_list}/README.md | 0 .../compute_partition_list.py | 0 .../custom_python_models}/README.md | 0 .../custom_python_models}/lightgbm.py | 0 .../dataset_last_run_job_info}/README.md | 0 .../get_job_info_for_datasets_in_project.py | 0 .../partition_list_variable}/README.md | 0 .../build_only_new_partitions.py | 0 .../build_whole_output_custom.py | 0 .../python_io_examples}/README.md | 0 .../pandas_chunked_read_write.py | 0 {reco => _old/reco}/README.md | 0 {reco => _old/reco}/__init__.py | 0 {reco => _old/reco}/surprise_wrapper.py | 0 .../authenticate-calls/README.md | 0 .../flask-webapps/authenticate-calls/app.js | 0 .../authenticate-calls/backend.py | 0 .../flask-session-per-browser/README.md | 0 .../flask-session-per-browser/app.js | 0 .../flask-session-per-browser/backend.py | 0 .../flask-session-per-browser/body.html | 0 .../flask-session-per-frontend/README.md | 0 .../flask-session-per-frontend/app.js | 0 .../flask-session-per-frontend/backend.py | 0 .../flask-session-per-frontend/body.html | 0 .../flask-session-per-user/README.md | 0 .../flask-session-per-user/app.js | 0 .../flask-session-per-user/backend.py | 0 .../flask-session-per-user/body.html | 0 .../shiny/authenticate-calls/README.md | 0 .../shiny/authenticate-calls/server.R | 0 .../shiny/authenticate-calls/ui.R | 0 .../shiny/shiny-and-dygraphs/README.md | 0 .../shiny/shiny-and-dygraphs/UI.R | 0 .../shiny/shiny-and-dygraphs/server.R | 0 admin/README.md | 10 ++++++++++ applications/README.md | 4 ++++ datasets/README.md | 10 ++++++++++ machine_learning/README.md | 8 ++++++++ metrics_and_checks/README.md | 4 ++++ scenarios/README.md | 3 +++ statistics/README.md | 3 +++ webapps/README.md | 7 +++++++ 50 files changed, 69 insertions(+) rename {administration => _old/administration}/index-all-hive-databases/README.md (100%) rename {administration => _old/administration}/index-all-hive-databases/index-all-hive-databases.py (100%) rename {client_api_utils => _old/client_api_utils}/README.md (100%) rename {client_api_utils => _old/client_api_utils}/hive_config_migration/gracefully_provited_hive_legacy_migration.py (100%) rename {client_api_utils => _old/client_api_utils}/project_bundle_deployment/README.md (100%) rename {client_api_utils => _old/client_api_utils}/project_bundle_deployment/manage_bundles.py (100%) rename {compute_partition_list => _old/compute_partition_list}/README.md (100%) rename {compute_partition_list => _old/compute_partition_list}/compute_partition_list.py (100%) rename {custom_python_models => _old/custom_python_models}/README.md (100%) rename {custom_python_models => _old/custom_python_models}/lightgbm.py (100%) rename {dataset_last_run_job_info => _old/dataset_last_run_job_info}/README.md (100%) rename {dataset_last_run_job_info => _old/dataset_last_run_job_info}/get_job_info_for_datasets_in_project.py (100%) rename {partition_list_variable => _old/partition_list_variable}/README.md (100%) rename {partition_list_variable => _old/partition_list_variable}/build_only_new_partitions.py (100%) rename {partition_list_variable => 
_old/partition_list_variable}/build_whole_output_custom.py (100%) rename {python_io_examples => _old/python_io_examples}/README.md (100%) rename {python_io_examples => _old/python_io_examples}/pandas_chunked_read_write.py (100%) rename {reco => _old/reco}/README.md (100%) rename {reco => _old/reco}/__init__.py (100%) rename {reco => _old/reco}/surprise_wrapper.py (100%) rename {visualization => _old/visualization}/flask-webapps/authenticate-calls/README.md (100%) rename {visualization => _old/visualization}/flask-webapps/authenticate-calls/app.js (100%) rename {visualization => _old/visualization}/flask-webapps/authenticate-calls/backend.py (100%) rename {visualization => _old/visualization}/flask-webapps/flask-session-per-browser/README.md (100%) rename {visualization => _old/visualization}/flask-webapps/flask-session-per-browser/app.js (100%) rename {visualization => _old/visualization}/flask-webapps/flask-session-per-browser/backend.py (100%) rename {visualization => _old/visualization}/flask-webapps/flask-session-per-browser/body.html (100%) rename {visualization => _old/visualization}/flask-webapps/flask-session-per-frontend/README.md (100%) rename {visualization => _old/visualization}/flask-webapps/flask-session-per-frontend/app.js (100%) rename {visualization => _old/visualization}/flask-webapps/flask-session-per-frontend/backend.py (100%) rename {visualization => _old/visualization}/flask-webapps/flask-session-per-frontend/body.html (100%) rename {visualization => _old/visualization}/flask-webapps/flask-session-per-user/README.md (100%) rename {visualization => _old/visualization}/flask-webapps/flask-session-per-user/app.js (100%) rename {visualization => _old/visualization}/flask-webapps/flask-session-per-user/backend.py (100%) rename {visualization => _old/visualization}/flask-webapps/flask-session-per-user/body.html (100%) rename {visualization => _old/visualization}/shiny/authenticate-calls/README.md (100%) rename {visualization => _old/visualization}/shiny/authenticate-calls/server.R (100%) rename {visualization => _old/visualization}/shiny/authenticate-calls/ui.R (100%) rename {visualization => _old/visualization}/shiny/shiny-and-dygraphs/README.md (100%) rename {visualization => _old/visualization}/shiny/shiny-and-dygraphs/UI.R (100%) rename {visualization => _old/visualization}/shiny/shiny-and-dygraphs/server.R (100%) create mode 100644 admin/README.md create mode 100644 applications/README.md create mode 100644 datasets/README.md create mode 100644 machine_learning/README.md create mode 100644 metrics_and_checks/README.md create mode 100644 scenarios/README.md create mode 100644 statistics/README.md create mode 100644 webapps/README.md diff --git a/README.md b/README.md index 8ee7c6e..d5e8daa 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,22 @@ # dss-code-samples Various code samples for using DSS + +## Refactoring + +### Structure + +``` +dss-code-samples +|_admin +|_applications +|_datasets +|_formulas +|_metrics_and_checks +|_machine_learning +|_partitioning +|_scenarios +|_statistics +|_webapps +``` + + diff --git a/administration/index-all-hive-databases/README.md b/_old/administration/index-all-hive-databases/README.md similarity index 100% rename from administration/index-all-hive-databases/README.md rename to _old/administration/index-all-hive-databases/README.md diff --git a/administration/index-all-hive-databases/index-all-hive-databases.py b/_old/administration/index-all-hive-databases/index-all-hive-databases.py similarity index 100% rename from 
administration/index-all-hive-databases/index-all-hive-databases.py rename to _old/administration/index-all-hive-databases/index-all-hive-databases.py diff --git a/client_api_utils/README.md b/_old/client_api_utils/README.md similarity index 100% rename from client_api_utils/README.md rename to _old/client_api_utils/README.md diff --git a/client_api_utils/hive_config_migration/gracefully_provited_hive_legacy_migration.py b/_old/client_api_utils/hive_config_migration/gracefully_provited_hive_legacy_migration.py similarity index 100% rename from client_api_utils/hive_config_migration/gracefully_provited_hive_legacy_migration.py rename to _old/client_api_utils/hive_config_migration/gracefully_provited_hive_legacy_migration.py diff --git a/client_api_utils/project_bundle_deployment/README.md b/_old/client_api_utils/project_bundle_deployment/README.md similarity index 100% rename from client_api_utils/project_bundle_deployment/README.md rename to _old/client_api_utils/project_bundle_deployment/README.md diff --git a/client_api_utils/project_bundle_deployment/manage_bundles.py b/_old/client_api_utils/project_bundle_deployment/manage_bundles.py similarity index 100% rename from client_api_utils/project_bundle_deployment/manage_bundles.py rename to _old/client_api_utils/project_bundle_deployment/manage_bundles.py diff --git a/compute_partition_list/README.md b/_old/compute_partition_list/README.md similarity index 100% rename from compute_partition_list/README.md rename to _old/compute_partition_list/README.md diff --git a/compute_partition_list/compute_partition_list.py b/_old/compute_partition_list/compute_partition_list.py similarity index 100% rename from compute_partition_list/compute_partition_list.py rename to _old/compute_partition_list/compute_partition_list.py diff --git a/custom_python_models/README.md b/_old/custom_python_models/README.md similarity index 100% rename from custom_python_models/README.md rename to _old/custom_python_models/README.md diff --git a/custom_python_models/lightgbm.py b/_old/custom_python_models/lightgbm.py similarity index 100% rename from custom_python_models/lightgbm.py rename to _old/custom_python_models/lightgbm.py diff --git a/dataset_last_run_job_info/README.md b/_old/dataset_last_run_job_info/README.md similarity index 100% rename from dataset_last_run_job_info/README.md rename to _old/dataset_last_run_job_info/README.md diff --git a/dataset_last_run_job_info/get_job_info_for_datasets_in_project.py b/_old/dataset_last_run_job_info/get_job_info_for_datasets_in_project.py similarity index 100% rename from dataset_last_run_job_info/get_job_info_for_datasets_in_project.py rename to _old/dataset_last_run_job_info/get_job_info_for_datasets_in_project.py diff --git a/partition_list_variable/README.md b/_old/partition_list_variable/README.md similarity index 100% rename from partition_list_variable/README.md rename to _old/partition_list_variable/README.md diff --git a/partition_list_variable/build_only_new_partitions.py b/_old/partition_list_variable/build_only_new_partitions.py similarity index 100% rename from partition_list_variable/build_only_new_partitions.py rename to _old/partition_list_variable/build_only_new_partitions.py diff --git a/partition_list_variable/build_whole_output_custom.py b/_old/partition_list_variable/build_whole_output_custom.py similarity index 100% rename from partition_list_variable/build_whole_output_custom.py rename to _old/partition_list_variable/build_whole_output_custom.py diff --git a/python_io_examples/README.md 
b/_old/python_io_examples/README.md similarity index 100% rename from python_io_examples/README.md rename to _old/python_io_examples/README.md diff --git a/python_io_examples/pandas_chunked_read_write.py b/_old/python_io_examples/pandas_chunked_read_write.py similarity index 100% rename from python_io_examples/pandas_chunked_read_write.py rename to _old/python_io_examples/pandas_chunked_read_write.py diff --git a/reco/README.md b/_old/reco/README.md similarity index 100% rename from reco/README.md rename to _old/reco/README.md diff --git a/reco/__init__.py b/_old/reco/__init__.py similarity index 100% rename from reco/__init__.py rename to _old/reco/__init__.py diff --git a/reco/surprise_wrapper.py b/_old/reco/surprise_wrapper.py similarity index 100% rename from reco/surprise_wrapper.py rename to _old/reco/surprise_wrapper.py diff --git a/visualization/flask-webapps/authenticate-calls/README.md b/_old/visualization/flask-webapps/authenticate-calls/README.md similarity index 100% rename from visualization/flask-webapps/authenticate-calls/README.md rename to _old/visualization/flask-webapps/authenticate-calls/README.md diff --git a/visualization/flask-webapps/authenticate-calls/app.js b/_old/visualization/flask-webapps/authenticate-calls/app.js similarity index 100% rename from visualization/flask-webapps/authenticate-calls/app.js rename to _old/visualization/flask-webapps/authenticate-calls/app.js diff --git a/visualization/flask-webapps/authenticate-calls/backend.py b/_old/visualization/flask-webapps/authenticate-calls/backend.py similarity index 100% rename from visualization/flask-webapps/authenticate-calls/backend.py rename to _old/visualization/flask-webapps/authenticate-calls/backend.py diff --git a/visualization/flask-webapps/flask-session-per-browser/README.md b/_old/visualization/flask-webapps/flask-session-per-browser/README.md similarity index 100% rename from visualization/flask-webapps/flask-session-per-browser/README.md rename to _old/visualization/flask-webapps/flask-session-per-browser/README.md diff --git a/visualization/flask-webapps/flask-session-per-browser/app.js b/_old/visualization/flask-webapps/flask-session-per-browser/app.js similarity index 100% rename from visualization/flask-webapps/flask-session-per-browser/app.js rename to _old/visualization/flask-webapps/flask-session-per-browser/app.js diff --git a/visualization/flask-webapps/flask-session-per-browser/backend.py b/_old/visualization/flask-webapps/flask-session-per-browser/backend.py similarity index 100% rename from visualization/flask-webapps/flask-session-per-browser/backend.py rename to _old/visualization/flask-webapps/flask-session-per-browser/backend.py diff --git a/visualization/flask-webapps/flask-session-per-browser/body.html b/_old/visualization/flask-webapps/flask-session-per-browser/body.html similarity index 100% rename from visualization/flask-webapps/flask-session-per-browser/body.html rename to _old/visualization/flask-webapps/flask-session-per-browser/body.html diff --git a/visualization/flask-webapps/flask-session-per-frontend/README.md b/_old/visualization/flask-webapps/flask-session-per-frontend/README.md similarity index 100% rename from visualization/flask-webapps/flask-session-per-frontend/README.md rename to _old/visualization/flask-webapps/flask-session-per-frontend/README.md diff --git a/visualization/flask-webapps/flask-session-per-frontend/app.js b/_old/visualization/flask-webapps/flask-session-per-frontend/app.js similarity index 100% rename from 
visualization/flask-webapps/flask-session-per-frontend/app.js rename to _old/visualization/flask-webapps/flask-session-per-frontend/app.js diff --git a/visualization/flask-webapps/flask-session-per-frontend/backend.py b/_old/visualization/flask-webapps/flask-session-per-frontend/backend.py similarity index 100% rename from visualization/flask-webapps/flask-session-per-frontend/backend.py rename to _old/visualization/flask-webapps/flask-session-per-frontend/backend.py diff --git a/visualization/flask-webapps/flask-session-per-frontend/body.html b/_old/visualization/flask-webapps/flask-session-per-frontend/body.html similarity index 100% rename from visualization/flask-webapps/flask-session-per-frontend/body.html rename to _old/visualization/flask-webapps/flask-session-per-frontend/body.html diff --git a/visualization/flask-webapps/flask-session-per-user/README.md b/_old/visualization/flask-webapps/flask-session-per-user/README.md similarity index 100% rename from visualization/flask-webapps/flask-session-per-user/README.md rename to _old/visualization/flask-webapps/flask-session-per-user/README.md diff --git a/visualization/flask-webapps/flask-session-per-user/app.js b/_old/visualization/flask-webapps/flask-session-per-user/app.js similarity index 100% rename from visualization/flask-webapps/flask-session-per-user/app.js rename to _old/visualization/flask-webapps/flask-session-per-user/app.js diff --git a/visualization/flask-webapps/flask-session-per-user/backend.py b/_old/visualization/flask-webapps/flask-session-per-user/backend.py similarity index 100% rename from visualization/flask-webapps/flask-session-per-user/backend.py rename to _old/visualization/flask-webapps/flask-session-per-user/backend.py diff --git a/visualization/flask-webapps/flask-session-per-user/body.html b/_old/visualization/flask-webapps/flask-session-per-user/body.html similarity index 100% rename from visualization/flask-webapps/flask-session-per-user/body.html rename to _old/visualization/flask-webapps/flask-session-per-user/body.html diff --git a/visualization/shiny/authenticate-calls/README.md b/_old/visualization/shiny/authenticate-calls/README.md similarity index 100% rename from visualization/shiny/authenticate-calls/README.md rename to _old/visualization/shiny/authenticate-calls/README.md diff --git a/visualization/shiny/authenticate-calls/server.R b/_old/visualization/shiny/authenticate-calls/server.R similarity index 100% rename from visualization/shiny/authenticate-calls/server.R rename to _old/visualization/shiny/authenticate-calls/server.R diff --git a/visualization/shiny/authenticate-calls/ui.R b/_old/visualization/shiny/authenticate-calls/ui.R similarity index 100% rename from visualization/shiny/authenticate-calls/ui.R rename to _old/visualization/shiny/authenticate-calls/ui.R diff --git a/visualization/shiny/shiny-and-dygraphs/README.md b/_old/visualization/shiny/shiny-and-dygraphs/README.md similarity index 100% rename from visualization/shiny/shiny-and-dygraphs/README.md rename to _old/visualization/shiny/shiny-and-dygraphs/README.md diff --git a/visualization/shiny/shiny-and-dygraphs/UI.R b/_old/visualization/shiny/shiny-and-dygraphs/UI.R similarity index 100% rename from visualization/shiny/shiny-and-dygraphs/UI.R rename to _old/visualization/shiny/shiny-and-dygraphs/UI.R diff --git a/visualization/shiny/shiny-and-dygraphs/server.R b/_old/visualization/shiny/shiny-and-dygraphs/server.R similarity index 100% rename from visualization/shiny/shiny-and-dygraphs/server.R rename to 
_old/visualization/shiny/shiny-and-dygraphs/server.R diff --git a/admin/README.md b/admin/README.md new file mode 100644 index 0000000..5b2eada --- /dev/null +++ b/admin/README.md @@ -0,0 +1,10 @@ +# Administration + +## TODO + +- [ ] List jobs currently running +- [ ] Create a code environment from a list of packages +- [ ] Programmatically add impersonation rules +- [ ] Create API service infrastructure +- [ ] Create a mapping between code environments and Python/R recipes +- [ ] Create and manage project folders diff --git a/applications/README.md b/applications/README.md new file mode 100644 index 0000000..29b471d --- /dev/null +++ b/applications/README.md @@ -0,0 +1,4 @@ +# Dataiku Applications + +- [ ] App-as-API example +- [ ] List and cleanup application instances diff --git a/datasets/README.md b/datasets/README.md new file mode 100644 index 0000000..e60dc27 --- /dev/null +++ b/datasets/README.md @@ -0,0 +1,10 @@ +# Datasets + +## TODO + +- [ ] Programmatically build partitions +- [ ] Read from/write to non-local-FS-folders +- [ ] Flag and delete orphaned datasets +- [ ] Schema propagation from updated dataset +- [ ] Create "Upload" dataset and add/replace file(s) + diff --git a/machine_learning/README.md b/machine_learning/README.md new file mode 100644 index 0000000..fd20a25 --- /dev/null +++ b/machine_learning/README.md @@ -0,0 +1,8 @@ +# Machine learning + +- [ ] "Pure code" model training and batch scoring in PyTorch +- [ ] "Pure code" model training and batch scoring in Tensorflow 2.x +- [ ] Visual ML: custom preprocessing (numerical + categorical) +- [ ] Visual ML: custom evaluation metric (classification + regression) +- [ ] Visual ML: custom Python model (classification + regression) +- [ ] Visual ML: download pre-trained model in a managed folder diff --git a/metrics_and_checks/README.md b/metrics_and_checks/README.md new file mode 100644 index 0000000..cc00c68 --- /dev/null +++ b/metrics_and_checks/README.md @@ -0,0 +1,4 @@ +# Metrics and checks + +- [ ] Retrieve metrics history of a dataset +- [ ] Retrieve metrics history of a model diff --git a/scenarios/README.md b/scenarios/README.md new file mode 100644 index 0000000..940ffb3 --- /dev/null +++ b/scenarios/README.md @@ -0,0 +1,3 @@ +# Scenarios + +- [ ] Implement a timeout for a particular scenario step diff --git a/statistics/README.md b/statistics/README.md new file mode 100644 index 0000000..54f19e4 --- /dev/null +++ b/statistics/README.md @@ -0,0 +1,3 @@ +# Interactive statistics + +- [ ] Retrieve correlation matrix from dataset/worksheet/card diff --git a/webapps/README.md b/webapps/README.md new file mode 100644 index 0000000..ed38fbf --- /dev/null +++ b/webapps/README.md @@ -0,0 +1,7 @@ +# Webapps + +- [ ] Authenticate users on a Flask webapp +- [ ] Maintaining a model per browser/frontend/user on a Flask webapp +- [ ] Authenticate users on a Shiny webapp +- [ ] Display interactive time series in a Shiny webapp +- [ ] Display interactive time series in a Bokeh webapp From 073c90738ab83d4654d07d08e3c0a9890c97999f Mon Sep 17 00:00:00 2001 From: Harizo Rajaona Date: Wed, 4 Nov 2020 19:07:09 +0100 Subject: [PATCH 02/14] Add instructions for venv setup --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index d5e8daa..f63b577 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,21 @@ Various code samples for using DSS ## Refactoring +### Getting started + +(DSS >= 8.0.3) + +Create a dedicated virtual environment and install the following packages: +* 
`dataiku-internal-client`: follow the instructions in the [DSS doc](https://doc.dataiku.com/dss/latest/python-api/outside-usage.html#installing-the-package) +* `dataikuapi`: + ``` + $ pip install dataiku-api-client + ``` +* `pandas`: + ``` + $ pip install "pandas>=1.0,<1.1" + ``` + ### Structure ``` @@ -20,3 +35,4 @@ dss-code-samples ``` + From 0559f646332205123db8376df11f737f7c573f21 Mon Sep 17 00:00:00 2001 From: Harizo Rajaona Date: Wed, 4 Nov 2020 22:50:47 +0100 Subject: [PATCH 03/14] WIP working version of list_jobs --- .gitignore | 3 +++ admin/list_jobs.py | 30 ++++++++++++++++++++++++++++++ requirements.txt | 13 +++++++++++++ 3 files changed, 46 insertions(+) create mode 100644 .gitignore create mode 100644 admin/list_jobs.py create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..07bc6ad --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +venv/ +.vscode/ + diff --git a/admin/list_jobs.py b/admin/list_jobs.py new file mode 100644 index 0000000..184c22b --- /dev/null +++ b/admin/list_jobs.py @@ -0,0 +1,30 @@ +import dataiku +from datetime import datetime + +def list_jobs_by_status(client=None, project_key=None): + """ + """ + + project = client.get_project(project_key) + jobs_by_status = {"RUNNING": [], + "FAILED": [], + "DONE": [], + "ABORTED": []} + for job in project.list_jobs(): + if "state" not in job: + jobs_by_status["RUNNING"].append(job) + else: + jobs_by_status[job["state"]].append(job) + return jobs_by_status + + +def filter_jobs_by_start_date(jobs_by_status=None, start_date=None): + """ + """ + + start_date_timestamp = int(datetime.strptime(start_date, "%Y/%m/%d").strftime("%s")) * 1000 + is_after_start_date = lambda x, d: x["def"]["initiationTimestamp"] > d + jobs_after_start_date = {_status: [job for job in _list if is_after_start_date(job, start_date_timestamp)] for _status, _list in jobs_by_status.items()} + return jobs_after_start_date + + diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..63b9507 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,13 @@ +certifi==2020.6.20 +chardet==3.0.4 +idna==2.8 +numpy==1.19.4 +pandas==1.0.5 +python-dateutil==2.8.0 +pytz==2019.2 +requests==2.22.0 +six==1.15.0 +urllib3==1.25.11 +dataiku-api-client==8.0.0 +http://localhost:40000/public/packages/dataiku-internal-client.tar.gz + From a998e91df377bd12b9fa09714ff3e2cac38418cb Mon Sep 17 00:00:00 2001 From: Harizo Rajaona Date: Thu, 5 Nov 2020 10:50:07 +0100 Subject: [PATCH 04/14] Add docstrings and abort function --- admin/list_jobs.py | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/admin/list_jobs.py b/admin/list_jobs.py index 184c22b..0939cbb 100644 --- a/admin/list_jobs.py +++ b/admin/list_jobs.py @@ -1,8 +1,16 @@ import dataiku from datetime import datetime + def list_jobs_by_status(client=None, project_key=None): - """ + """List jobs by current status in a given project. 
+ +    Args: +        client: A handle on the target DSS instance +        project_key: A string representing the target project key + +    Returns: +        jobs_by_status: A dict of lists mapping jobs and their states     """      project = client.get_project(project_key) @@ -11,20 +19,46 @@ def list_jobs_by_status(client=None, project_key=None):                      "DONE": [],                      "ABORTED": []}     for job in project.list_jobs(): -        if "state" not in job: +        if not job["stableState"]:             jobs_by_status["RUNNING"].append(job)         else:             jobs_by_status[job["state"]].append(job)     return jobs_by_status  -def filter_jobs_by_start_date(jobs_by_status=None, start_date=None): -    """ -    """ +def filter_jobs_by_start_date(client=None, project_key=None, start_date=None): +    """List jobs that were started after a specific date. + +    Args: +        client: A handle on the target DSS instance +        project_key: A string representing the target project key +        start_date: A string of the form 'YYYY/mm/dd' + +    Returns: +        jobs_after_start_date: A dict of lists mapping jobs and their states +    """ +    jobs_by_status = list_jobs_by_status(client, project_key)     start_date_timestamp = int(datetime.strptime(start_date, "%Y/%m/%d").strftime("%s")) * 1000     is_after_start_date = lambda x, d: x["def"]["initiationTimestamp"] > d     jobs_after_start_date = {_status: [job for job in _list if is_after_start_date(job, start_date_timestamp)] for _status, _list in jobs_by_status.items()}     return jobs_after_start_date +def abort_all_running_jobs(client=None, project_key=None): +    """Terminate all running jobs in a project. + +    Args: +        client: A handle on the target DSS instance +        project_key: A string representing the target project key +    """ + +    project = client.get_project(project_key) +    aborted_jobs = [] +    for job in project.list_jobs(): +        if not job["stableState"]: +            job_id = job["def"]["id"] +            aborted_jobs.append(job_id) +            project.get_job(job_id).abort() +    print(f"Aborted {len(aborted_jobs)} running jobs") + From d6a215fd3b43636436a4119077ea5403654a7a9b Mon Sep 17 00:00:00 2001 From: Harizo Rajaona Date: Thu, 5 Nov 2020 11:31:24 +0100 Subject: [PATCH 05/14] Make first samples importable --- .gitignore | 1 + admin/__init__.py | 0 admin/{list_jobs.py => job_utils.py} | 0 datasets/README.md | 1 + 4 files changed, 2 insertions(+) create mode 100644 admin/__init__.py rename admin/{list_jobs.py => job_utils.py} (100%) diff --git a/.gitignore b/.gitignore index 07bc6ad..48abe11 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ venv/ .vscode/ +__pycache__/ diff --git a/admin/__init__.py b/admin/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/admin/list_jobs.py b/admin/job_utils.py similarity index 100% rename from admin/list_jobs.py rename to admin/job_utils.py diff --git a/datasets/README.md b/datasets/README.md index e60dc27..9894a39 100644 --- a/datasets/README.md +++ b/datasets/README.md @@ -3,6 +3,7 @@ ## TODO - [ ] Programmatically build partitions +- [ ] Retrieve last build date (PR #3) - [ ] Read from/write to non-local-FS-folders - [ ] Flag and delete orphaned datasets - [ ] Schema propagation from updated dataset - [ ] Create "Upload" dataset and add/replace file(s) From 8a601b629d944fca86d4b04f663ea146a893fa78 Mon Sep 17 00:00:00 2001 From: Harizo Rajaona Date: Fri, 6 Nov 2020 11:38:34 +0100 Subject: [PATCH 06/14] More ideas --- README.md | 12 ++++++++++++ datasets/README.md | 1 + 2 files changed, 13 insertions(+) diff --git a/README.md b/README.md index f63b577..20111b8 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,18 @@ Various code samples for using DSS (DSS >= 8.0.3) +#### Use within DSS (as project library) +- Register the repository in the project library's Git references +- No need to specify remote DSS params +- Import the samples directly from your recipes and notebooks + +#### Outside of DSS +- Clone the repository and archive it (tar/zip) +- Create a virtualenv with the DSS requirements and install the archive +- Import the samples from your own scripts
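+
+A minimal connection sketch for the "Outside of DSS" case (the URL and API key below are placeholders for your own instance):
+```python
+import dataikuapi
+
+# Base URL of the DSS instance and a personal API key from your profile page
+client = dataikuapi.DSSClient("https://dss.example.com:11200", "YOUR_API_KEY_SECRET")
+
+# Sanity check: prints the identity attached to the API key
+print(client.get_auth_info())
+```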
+ +You can reuse them as they are, customize them for your own needs, and even package them into plugins. + Create a dedicated virtual environment and install the following packages: * `dataiku-internal-client`: follow the instructions in the [DSS doc](https://doc.dataiku.com/dss/latest/python-api/outside-usage.html#installing-the-package) diff --git a/datasets/README.md b/datasets/README.md index 9894a39..9043a51 100644 --- a/datasets/README.md +++ b/datasets/README.md @@ -5,6 +5,7 @@ - [ ] Programmatically build partitions - [ ] Retrieve last build date (PR #3) - [ ] Read from/write to non-local-FS-folders +- [ ] Run containerized execution with input/output data in managed folders - [ ] Flag and delete orphaned datasets - [ ] Schema propagation from updated dataset - [ ] Create "Upload" dataset and add/replace file(s) From 699f2cd81d60f613e85ae5e08124fdce03f55441 Mon Sep 17 00:00:00 2001 From: Harizo Rajaona Date: Mon, 16 Nov 2020 17:47:18 +0100 Subject: [PATCH 07/14] Add edit_project_permissions() --- admin/README.md | 16 +++++++++++++--- admin/project_utils.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 3 deletions(-) create mode 100644 admin/project_utils.py diff --git a/admin/README.md b/admin/README.md index 5b2eada..0ac99db 100644 --- a/admin/README.md +++ b/admin/README.md @@ -2,9 +2,19 @@ ## TODO -- [ ] List jobs currently running +### Jobs +- [x] List jobs currently running + +### Projects +- [x] Assign group permissions to project +- [ ] Create & manage project folders +### Code environments - [ ] Create a code environment from a list of packages +- [ ] Create a mapping between code environments and Python/R recipes + +### Security & user isolation - [ ] Programmatically add impersonation rules + +### Infrastructure - [ ] Create API service infrastructure -- [ ] Create a mapping between code environments and Python/R recipes -- [ ] Create and manage project folders + diff --git a/admin/project_utils.py b/admin/project_utils.py new file mode 100644 index 0000000..503dddd --- /dev/null +++ b/admin/project_utils.py @@ -0,0 +1,33 @@ +import dataiku + +import dataiku + +def edit_project_permissions(client=None, project_key=None, group=None, perms=None, revoke=False): +    """Grant or revoke project permissions for a given group.
+ +    Args: +        client: A handle on the target DSS instance +        project_key: A string representing the target project key +        group: A string representing the target group name +        perms: A list of permissions to grant +        revoke: A boolean for completely revoking access to the project +    """ + +    prj = client.get_project(project_key) +    perm_obj = prj.get_permissions() +    perm_list = perm_obj["permissions"] +    for p in list(perm_list): +        if p["group"] == group: +            print("Deleting existing permissions...") +            perm_list.remove(p) +    if revoke: +        perm_obj["permissions"] = perm_list +        print(f"Revoking all permissions on project {project_key} for group {group}") +    else: +        if not perms: +            print("Missing permission list, will grant ADMIN instead...") +            perms = ["admin"] +        new_group_perms = dict({"group": group}, **{p: True for p in perms}) +        perm_obj["permissions"].append(new_group_perms) +        print(f"Granting {perms} to group {group} on project {project_key}...") +    prj.set_permissions(perm_obj) \ No newline at end of file From d2f0eb9a33974f7deee83dd6d58f4b609abc203a Mon Sep 17 00:00:00 2001 From: Harizo Rajaona Date: Sat, 19 Dec 2020 10:36:33 +0100 Subject: [PATCH 08/14] More things on todo list --- machine_learning/README.md | 24 ++++++++++++++++++++++++ webapps/README.md | 1 + 2 files changed, 25 insertions(+) diff --git a/machine_learning/README.md b/machine_learning/README.md index fd20a25..7ef4240 100644 --- a/machine_learning/README.md +++ b/machine_learning/README.md @@ -1,8 +1,32 @@ # Machine learning +- [ ] List all models and corresp. active version in a project - [ ] "Pure code" model training and batch scoring in PyTorch - [ ] "Pure code" model training and batch scoring in Tensorflow 2.x +- [ ] Custom model deployed on API service - [ ] Visual ML: custom preprocessing (numerical + categorical) - [ ] Visual ML: custom evaluation metric (classification + regression) - [ ] Visual ML: custom Python model (classification + regression) - [ ] Visual ML: download pre-trained model in a managed folder +- [ ] Retrieve and deploy the best model of a training session in the visual analysis +    ``` +    import dataiku +client = dataiku.api_client() +project = client.get_project('YOUR_PROJECT_KEY') + +analysis_id = 'k2BRw36W' # this can be found in the analysis URL or using project.list_analyses() +ml_taskid = 'aG8nyE8E' # this can be found in the mltask URL or using analysis.list_ml_tasks() +model_name = 'my_model' # name of the model that will be deployed to flow +train_set = 'train' # name of my trainset + +analysis = project.get_analysis(analysis_id) +mltask = analysis.get_ml_task(ml_taskid) +trained_models = mltask.get_trained_models_ids() +trained_models_snippets = [mltask.get_trained_model_snippet(model) for model in trained_models] + +# Compare models to find the one you want to deploy, here we want to deploy the model with best r2 score +best_model = max(trained_models_snippets, key=lambda x:x['r2']) +# Deploy the best model to the flow, can also use mltask.redeploy_to_flow() to update an existing model +mltask.deploy_to_flow(best_model['fullModelId'], model_name, train_set ) +``` + diff --git a/webapps/README.md b/webapps/README.md index ed38fbf..990c06c 100644 --- a/webapps/README.md +++ b/webapps/README.md @@ -1,5 +1,6 @@ # Webapps +- [ ] Helpers (REST) to start/stop/list webapp backends - [ ] Authenticate users on a Flask webapp - [ ] Maintaining a model per browser/frontend/user on a Flask webapp - [ ] Authenticate users on a Shiny webapp - [ ] Display interactive time series in a Shiny webapp - [ ] Display interactive time series in a Bokeh webapp From b67fe385512432f4dfc50d8b15ce12597fcefc85 Mon Sep 17 00:00:00 2001
From: Harizo Rajaona Date: Sun, 20 Dec 2020 10:34:19 +0100 Subject: [PATCH 09/14] Add explore_saved_models --- admin/project_utils.py | 4 +--- machine_learning/saved_model_utils.py | 30 +++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) create mode 100644 machine_learning/saved_model_utils.py diff --git a/admin/project_utils.py b/admin/project_utils.py index 503dddd..8187901 100644 --- a/admin/project_utils.py +++ b/admin/project_utils.py @@ -1,7 +1,5 @@ import dataiku -import dataiku - def edit_project_permissions(client=None, project_key=None, group=None, perms=None, revoke=False): """Grant or revoke project permissions for a given group. @@ -30,4 +28,4 @@ def edit_project_permissions(client=None, project_key=None, group=None, perms=No new_group_perms = dict({"group": group}, **{p: True for p in perms}) perm_obj["permissions"].append(new_group_perms) print(f"Granting {perms} to group {group} on project {project_key}...") -    prj.set_permissions(perm_obj) \ No newline at end of file +    prj.set_permissions(perm_obj) diff --git a/machine_learning/saved_model_utils.py b/machine_learning/saved_model_utils.py new file mode 100644 index 0000000..00ef12d --- /dev/null +++ b/machine_learning/saved_model_utils.py @@ -0,0 +1,30 @@ +import dataiku + +def explore_saved_models(client=None, project_key=None): +    """List saved models of a project and give details on the active versions. + +    Args: +        client: A handle on the target DSS instance +        project_key: A string representing the target project key + +    Returns: +        smdl_list: A list of dicts with all saved model ids and perf + algorithm +            for the active versions. + +    """ +    smdl_list = [] +    prj = client.get_project(project_key) +    smdl_ids = [x["id"] for x in prj.list_saved_models()] +    for smdl in smdl_ids: +        data = {} +        obj = prj.get_saved_model(smdl) +        data["version_ids"] = [m["id"] for m in obj.list_versions()] +        active_version_id = obj.get_active_version()["id"] +        active_version_details = obj.get_version_details(active_version_id) +        data["active_version"] = {"id": active_version_id, +                                  "algorithm": active_version_details.details["actualParams"]["resolved"]["algorithm"], +                                  "performance_metrics": active_version_details.get_performance_metrics()} +        smdl_list.append(data) +    return smdl_list + + From f0e20b3179553d58a113c6cf3e3992114c25169e Mon Sep 17 00:00:00 2001 From: Harizo Rajaona Date: Tue, 12 Jan 2021 12:24:06 +0100 Subject: [PATCH 10/14] More details in READMEs and mltask stuff --- admin/README.md | 20 -------------------- admin/spark_utils.py | 6 ++++++ ci_cd/README.md | 3 +++ deployer/README.md | 3 +++ machine_learning/README.md | 2 +- machine_learning/mltask_utils.py | 32 ++++++++++++++++++++++++++++++++ projects/README.md | 27 +++++++++++++++++++++++++++ scenarios/README.md | 5 +++++ 8 files changed, 77 insertions(+), 21 deletions(-) delete mode 100644 admin/README.md create mode 100644 admin/spark_utils.py create mode 100644 ci_cd/README.md create mode 100644 deployer/README.md create mode 100644 machine_learning/mltask_utils.py create mode 100644 projects/README.md diff --git a/admin/README.md b/admin/README.md deleted file mode 100644 index 0ac99db..0000000 --- a/admin/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# Administration - -## TODO - -### Jobs -- [x] List jobs currently running - -### Projects -- [x] Assign group permissions to project -- [ ] Create & manage project folders -### Code environments -- [ ] Create a code environment from a list of packages -- [ ] Create a mapping between code environments and Python/R recipes - 
-### Security & user isolation -- [ ] Programmatically add impersonation rules - -### Infrastructure -- [ ] Create API service infrastructure - diff --git a/admin/spark_utils.py b/admin/spark_utils.py new file mode 100644 index 0000000..d7b5a10 --- /dev/null +++ b/admin/spark_utils.py @@ -0,0 +1,6 @@ +import dataiku + +def add_spark_config(client=None, config=None): +    raise NotImplementedError + + diff --git a/ci_cd/README.md b/ci_cd/README.md new file mode 100644 index 0000000..5b70d0d --- /dev/null +++ b/ci_cd/README.md @@ -0,0 +1,3 @@ +# CI/CD + +- [ ] (?) Example of Azure DevOps pipeline to deploy on the automation node diff --git a/deployer/README.md b/deployer/README.md new file mode 100644 index 0000000..b15d067 --- /dev/null +++ b/deployer/README.md @@ -0,0 +1,3 @@ +- Keep a database connection persistent to speed up response time by opening a connection pool outside of the API function +- Image classification scoring on a custom deep learning model +- \ No newline at end of file diff --git a/machine_learning/README.md b/machine_learning/README.md index 7ef4240..8e63100 100644 --- a/machine_learning/README.md +++ b/machine_learning/README.md @@ -1,6 +1,6 @@ # Machine learning -- [ ] List all models and corresp. active version in a project +- [x] List all models and corresp. active version in a project - [ ] "Pure code" model training and batch scoring in PyTorch - [ ] "Pure code" model training and batch scoring in Tensorflow 2.x - [ ] Custom model deployed on API service diff --git a/machine_learning/mltask_utils.py b/machine_learning/mltask_utils.py new file mode 100644 index 0000000..723f022 --- /dev/null +++ b/machine_learning/mltask_utils.py @@ -0,0 +1,32 @@ +import dataiku + +def deploy_best_model(client=None, +                      project_key=None, +                      analysis_id=None, +                      mltask_id=None, +                      metric=None): +    """Deploy the best model (according to the input metric) of a mltask to the flow. + +    Args: +        client: A handle on the DSS instance +        project_key: A string representing the target project key +        analysis_id: A string linking to the target visual analysis +        mltask_id: A string linking to the target mltask in a given analysis +        metric: A string defining which metric to use for performance ranking + +    Returns: +    """ +    # WIP +    prj = client.get_project(project_key) +    analysis = prj.get_analysis(analysis_id) +    mltask = analysis.get_ml_task(mltask_id) +    trained_models = mltask.get_trained_models_ids() +    trained_models_snippets = [mltask.get_trained_model_snippet(m) for m in trained_models] +    best_model = max(trained_models_snippets, key=lambda x:x[metric]) +    return best_model + + + + + + diff --git a/projects/README.md b/projects/README.md new file mode 100644 index 0000000..f1ccdbb --- /dev/null +++ b/projects/README.md @@ -0,0 +1,27 @@ +- Build all +    ```python +    client = dataiku.api_client() +    project = client.get_project(dataiku.default_project_key()) +    flow = project.get_flow() +    graph = flow.get_graph() +    for k,v in graph.data.get('nodes').items(): +        if v.get('successors') == []: +            definition = { +                "type" : 'RECURSIVE_BUILD', +                "outputs" : [{"id": k}] +            } +            print('Building dataset {}'.format(k)) +            job = project.start_job(definition) +    ``` +    Will need adjustments if there are saved models (see the sketch below).
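+    One possible adjustment, sketched under the assumption that each node dict in `graph.data` carries a `type` field distinguishing saved models from datasets (verify the exact values on your own flow):
+    ```python
+    for k, v in graph.data.get('nodes').items():
+        # Only start builds from terminal nodes that are not saved models
+        if v.get('successors') == [] and v.get('type') != 'SAVED_MODEL':
+            definition = {"type": 'RECURSIVE_BUILD', "outputs": [{"id": k}]}
+            job = project.start_job(definition)
+    ```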
+ +- Build specific tags only +- Build specific zones only +- Detect schema changes on a dataset and propagate them +    ```python +    settings = dataset.get_settings() +    settings.get_raw()["schema"] = {"columns":[]} +    settings.save() +    new_settings = dataset.autodetect_settings() +    new_settings.save() +``` diff --git a/scenarios/README.md b/scenarios/README.md index 940ffb3..43de12f 100644 --- a/scenarios/README.md +++ b/scenarios/README.md @@ -1,3 +1,8 @@ # Scenarios - [ ] Implement a timeout for a particular scenario step + +- View all the "run after scenario" dependencies between projects +> you'll need to write some code using the public API to loop over the scenario settings, look for follow_scenariorun triggers, and build the dependency tree yourself
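+A starting point for that loop, as a sketch only (it assumes the raw scenario settings expose a `triggers` list, and that a `follow_scenariorun` trigger stores the upstream scenario in its `params`; verify the exact field names on one of your own triggers):
+```python
+from collections import defaultdict
+
+def scenario_dependencies(client):
+    """Map (project_key, scenario_id) -> list of (project_key, scenario_id) it runs after."""
+    deps = defaultdict(list)
+    for project_key in client.list_project_keys():
+        prj = client.get_project(project_key)
+        for scn in prj.list_scenarios():
+            raw = prj.get_scenario(scn["id"]).get_settings().get_raw()
+            for trigger in raw.get("triggers", []):
+                if trigger.get("type") == "follow_scenariorun":
+                    # Assumed params layout: upstream project key and scenario id
+                    params = trigger.get("params", {})
+                    deps[(project_key, scn["id"])].append(
+                        (params.get("projectKey", project_key), params.get("scenarioId")))
+    return deps
+```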
+ + From 30a8029d11da7233c8d54f151aab6e2df775030b Mon Sep 17 00:00:00 2001 From: Harizo Rajaona Date: Tue, 12 Jan 2021 16:56:43 +0100 Subject: [PATCH 11/14] Scenarios: append email reporter recipients --- scenarios/reporters.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 scenarios/reporters.py diff --git a/scenarios/reporters.py b/scenarios/reporters.py new file mode 100644 index 0000000..0186e74 --- /dev/null +++ b/scenarios/reporters.py @@ -0,0 +1,33 @@ +import dataiku + +def add_email_recipients(client=None, project_key=None, scenario_ids=[], recipients=[]): +    """Append additional recipients to scenario email reporters. + +    Args: +        client: A handle on the target DSS instance +        project_key: A string representing the target project key +        scenario_ids: A list of scenario ID strings +        recipients: A list of email address strings + +    """ + +    prj = client.get_project(project_key) +    if not scenario_ids: +        print("No scenario id specified, will apply to ALL scenarios") +        scenario_ids = [scn["id"] for scn in prj.list_scenarios()] + +    for scn_id in scenario_ids: +        handle = prj.get_scenario(scn_id) +        settings = handle.get_settings() +        reporters = settings.raw_reporters +        if not reporters: +            print("No reporter found.") +        else: +            for rep in reporters: +                if rep["messaging"]["type"] == "mail-scenario": +                    if rep["messaging"]["configuration"]["recipient"]: +                        sep = ', ' +                    else: +                        sep = '' +                    rep["messaging"]["configuration"]["recipient"] += (sep + ', '.join(recipients)) +        settings.save() From c8dea96a6d931b5057ae4fe32dff9ef2be572d8e Mon Sep 17 00:00:00 2001 From: Harizo Rajaona Date: Wed, 13 Jan 2021 15:47:21 +0100 Subject: [PATCH 12/14] Improve best model deployment from ML task --- applications/README.md | 4 -- ci_cd/README.md | 3 -- machine_learning/README.md | 32 ------------- machine_learning/mltask_utils.py | 81 +++++++++++++++++++++++------ scenarios/reporters.py | 5 +---- 5 files changed, 69 insertions(+), 52 deletions(-) delete mode 100644 applications/README.md delete mode 100644 ci_cd/README.md delete mode 100644 machine_learning/README.md diff --git a/applications/README.md b/applications/README.md deleted file mode 100644 index 29b471d..0000000 --- a/applications/README.md +++ /dev/null @@ -1,4 +0,0 @@ -# Dataiku Applications - -- [ ] App-as-API example -- [ ] List and cleanup application instances diff --git a/ci_cd/README.md b/ci_cd/README.md deleted file mode 100644 index 5b70d0d..0000000 --- a/ci_cd/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# CI/CD - -- [ ] (?) Example of Azure DevOps pipeline to deploy on the automation node diff --git a/machine_learning/README.md b/machine_learning/README.md deleted file mode 100644 index 8e63100..0000000 --- a/machine_learning/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# Machine learning - -- [x] List all models and corresp. active version in a project -- [ ] "Pure code" model training and batch scoring in PyTorch -- [ ] "Pure code" model training and batch scoring in Tensorflow 2.x -- [ ] Custom model deployed on API service -- [ ] Visual ML: custom preprocessing (numerical + categorical) -- [ ] Visual ML: custom evaluation metric (classification + regression) -- [ ] Visual ML: custom Python model (classification + regression) -- [ ] Visual ML: download pre-trained model in a managed folder -- [ ] Retrieve and deploy the best model of a training session in the visual analysis -    ``` -    import dataiku -client = dataiku.api_client() -project = client.get_project('YOUR_PROJECT_KEY') - -analysis_id = 'k2BRw36W' # this can be found in the analysis URL or using project.list_analyses() -ml_taskid = 'aG8nyE8E' # this can be found in the mltask URL or using analysis.list_ml_tasks() -model_name = 'my_model' # name of the model that will be deployed to flow -train_set = 'train' # name of my trainset - -analysis = project.get_analysis(analysis_id) -mltask = analysis.get_ml_task(ml_taskid) -trained_models = mltask.get_trained_models_ids() -trained_models_snippets = [mltask.get_trained_model_snippet(model) for model in trained_models] - -# Compare models to find the one you want to deploy, here we want to deploy the model with best r2 score -best_model = max(trained_models_snippets, key=lambda x:x['r2']) -# Deploy the best model to the flow, can also use mltask.redeploy_to_flow() to update an existing model -mltask.deploy_to_flow(best_model['fullModelId'], model_name, train_set ) -``` - diff --git a/machine_learning/mltask_utils.py b/machine_learning/mltask_utils.py index 723f022..e5e062b 100644 --- a/machine_learning/mltask_utils.py +++ b/machine_learning/mltask_utils.py @@ -1,32 +1,89 @@ import dataiku -def deploy_best_model(client=None, +def get_best_model(client=None, project_key=None, analysis_id=None, -                      mltask_id=None, +                   ml_task_id=None, metric=None): -    """Deploy the best model (according to the input metric) of a mltask to the flow. +    """Return the 'best model' (according to the input metric) of an ML task. Args: client: A handle on the DSS instance project_key: A string representing the target project key -        analysis_id: A string linking to the target visual analysis -        mltask_id: A string linking to the target mltask in a given analysis +        analysis_id: A string linking to the target visual analysis. +            Can be found in the analysis URL or via +            dataikuapi.dss.project.DSSProject.list_analyses() +        ml_task_id: A string linking to the target MLTask in a given analysis. +            Can be found in the ML task URL or via +            dataikuapi.dss.analysis.DSSAnalysis.list_ml_tasks() metric: A string defining which metric to use for performance ranking Returns: +        ml_task: A handle to interact with the ML task. +            Useful when (re)deploying the model.
+ +        best_model_id: A string containing the ID of the ML task's 'best model' + +    """ -    # WIP     prj = client.get_project(project_key)     analysis = prj.get_analysis(analysis_id) -    mltask = analysis.get_ml_task(mltask_id) -    trained_models = mltask.get_trained_models_ids() -    trained_models_snippets = [mltask.get_trained_model_snippet(m) for m in trained_models] -    best_model = max(trained_models_snippets, key=lambda x:x[metric]) -    return best_model - +    ml_task = analysis.get_ml_task(ml_task_id) +    trained_models = ml_task.get_trained_models_ids() +    trained_models_snippets = [ml_task.get_trained_model_snippet(m) for m in trained_models] +    # Assumes that for your metric, "higher is better" +    best_model_snippet = max(trained_models_snippets, key=lambda x:x[metric]) +    best_model_id = best_model_snippet["fullModelId"] +    return ml_task, best_model_id + + +def deploy_with_best_model(client=None, +                           project_key=None, +                           analysis_id=None, +                           ml_task_id=None, +                           metric=None, +                           saved_model_name=None, +                           training_dataset=None): +    """Create a new Saved Model in the Flow with the 'best model' of an ML task. + +    Args: +        client: A handle on the DSS instance +        project_key: A string representing the target project key. +        analysis_id: A string linking to the target visual analysis. +            Can be found in the analysis URL or via +            dataikuapi.dss.project.DSSProject.list_analyses(). +        ml_task_id: A string linking to the target MLTask in a given analysis. +            Can be found in the ML task URL or via +            dataikuapi.dss.analysis.DSSAnalysis.list_ml_tasks(). +        metric: A string defining which metric to use for performance ranking. +        saved_model_name: A string to name the newly-created Saved Model. +        training_dataset: A string representing the name of the dataset +            used as train set. +    """ +    ml_task, best_model_id = get_best_model(client, +                                            project_key, +                                            analysis_id, +                                            ml_task_id, +                                            metric) +    ml_task.deploy_to_flow(best_model_id, +                           saved_model_name, +                           training_dataset) + + +def update_with_best_model(client=None, +                           project_key=None, +                           analysis_id=None, +                           ml_task_id=None, +                           metric=None, +                           saved_model_name=None, +                           activate=True): +    """Update an existing Saved Model in the Flow with the 'best model' +    of an ML task.
+ +    """ +    ml_task, best_model_id = get_best_model(client, +                                            project_key, +                                            analysis_id, +                                            ml_task_id, +                                            metric) +    training_recipe_name = f"train_{saved_model_name}" +    ml_task.redeploy_to_flow(model_id=best_model_id, +                             recipe_name=training_recipe_name, +                             activate=activate) \ No newline at end of file diff --git a/scenarios/reporters.py b/scenarios/reporters.py index 0186e74..9de62fe 100644 --- a/scenarios/reporters.py +++ b/scenarios/reporters.py @@ -15,7 +15,6 @@ def add_email_recipients(client=None, project_key=None, scenario_ids=[], recipie     if not scenario_ids:         print("No scenario id specified, will apply to ALL scenarios")         scenario_ids = [scn["id"] for scn in prj.list_scenarios()] -     for scn_id in scenario_ids:         handle = prj.get_scenario(scn_id)         settings = handle.get_settings() From 70f7804ede68d9da3fc4e32424aea99a1b67ca8d Mon Sep 17 00:00:00 2001 From: Jean-Yves Gerardy Date: Fri, 15 Jan 2021 17:52:51 -0500 Subject: [PATCH 13/14] Add scenario and dataset snippets --- datasets/__init__.py | 0 datasets/dataset_utils.py | 16 +++++++++++++ scenarios/partitions.py | 48 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+) create mode 100644 datasets/__init__.py create mode 100644 datasets/dataset_utils.py create mode 100644 scenarios/partitions.py diff --git a/datasets/__init__.py b/datasets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/datasets/dataset_utils.py b/datasets/dataset_utils.py new file mode 100644 index 0000000..6e97fa0 --- /dev/null +++ b/datasets/dataset_utils.py @@ -0,0 +1,16 @@ +import dataiku +from datetime import datetime + +def get_last_build_date(client=None, project_key=None, dataset=None): +    """Returns a datetime object representing the last time an output +    dataset was built. +    Args: +        client: A handle on the target DSS instance. +        project_key: A string representing the target project key. +        dataset: Name of the dataset. +    """ +    dataset_info = dataiku.Dataset(dataset).get_files_info() +    last_modif = dataset_info.get("globalPaths")[0].get("lastModified") +    dt = datetime.fromtimestamp(last_modif/1000) +    return dt + diff --git a/scenarios/partitions.py b/scenarios/partitions.py new file mode 100644 index 0000000..6b371cb --- /dev/null +++ b/scenarios/partitions.py @@ -0,0 +1,48 @@ +import dataiku + +def build_all_partitions(scenario=None, +                         project_key=None, +                         input_dataset=None, +                         output_dataset=None): +    """Build all output partitions present in an input dataset. +    Requires input and output datasets to share the same partitioning +    format. +    Args: +        scenario: A dataiku.scenario.Scenario handle. +        project_key: A string representing the target project key. +        input_dataset: Name of the input dataset from which +            to list all partitions. +        output_dataset: String of the name of the dataset to build. +    """ +    input_dataset = dataiku.Dataset(input_dataset) +    partitions = input_dataset.list_partitions() +    partitions_str = ','.join(partitions) +    scenario.build_dataset(output_dataset, partitions=partitions_str) + +def build_new_partitions(scenario=None, +                         project_key=None, +                         input_dataset=None, +                         output_dataset=None): +    """Build partitions that are present in the input dataset but +    not in the output dataset (= new partitions). +    Requires input and output datasets to share the same partitioning +    format. +    Args: +        scenario: A dataiku.scenario.Scenario handle. +        project_key: A string representing the target project key. +        input_dataset: Name of the input dataset from which +            to list all partitions.
+        output_dataset: String of the name of the dataset to build. +    """ +    input_handle = dataiku.Dataset(input_dataset) +    output_handle = dataiku.Dataset(output_dataset) +    input_partitions = set(input_handle.list_partitions()) +    output_partitions = set(output_handle.list_partitions()) +    new_partitions = input_partitions - output_partitions +    partitions_str = ','.join(new_partitions) +    scenario.build_dataset(output_dataset, partitions=partitions_str) + + + + + \ No newline at end of file From 027a567d9526953361480bede48d1b55fc409bc4 Mon Sep 17 00:00:00 2001 From: Jean-Yves Gerardy Date: Tue, 19 Jan 2021 15:15:48 -0500 Subject: [PATCH 14/14] Add dynamic partition build --- datasets/dataset_utils.py | 16 ++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/datasets/dataset_utils.py b/datasets/dataset_utils.py index 6e97fa0..814d9c6 100644 --- a/datasets/dataset_utils.py +++ b/datasets/dataset_utils.py @@ -14,3 +14,19 @@ def get_last_build_date(client=None, project_key=None, dataset=None):     dt = datetime.fromtimestamp(last_modif/1000)     return dt +def build_todays_partition(client=None, project_key=None, dataset=None): +    """Build the partition for today's date in the specified dataset. +    Return the status of the build. +    Args: +        client: A handle on the target DSS instance. +        project_key: A string representing the target project key. +        dataset: Name of the dataset. +    """ +    now = datetime.now() +    partition = now.strftime("%Y-%m-%d") +    project = client.get_project(project_key) +    ds = project.get_dataset(dataset) +    job = ds.build(partitions=partition) +    return job.get_status() +
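+
+if __name__ == "__main__":
+    # Usage sketch only: the URL, API key, project and dataset names below
+    # are placeholders for your own instance.
+    import dataikuapi
+    host, key = "https://dss.example.com:11200", "YOUR_API_KEY_SECRET"
+    # dataiku.Dataset(...) needs the internal client pointed at the instance
+    dataiku.set_remote_dss(host, key)
+    dataiku.set_default_project_key("MY_PROJECT")
+    client = dataikuapi.DSSClient(host, key)
+    print(get_last_build_date(client, "MY_PROJECT", "my_dataset"))
+    print(build_todays_partition(client, "MY_PROJECT", "my_partitioned_dataset"))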