diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..48abe11
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+venv/
+.vscode/
+__pycache__/
+
diff --git a/README.md b/README.md
index 8ee7c6e..20111b8 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,50 @@
 # dss-code-samples
 Various code samples for using DSS
+
+## Refactoring
+
+### Getting started
+
+(DSS >= 8.0.3)
+
+#### Use within DSS (as a project library)
+- Register this repository as a Git reference in the project library
+- No need to specify remote DSS connection parameters
+- Import the modules directly from your recipes and notebooks
+
+#### Outside of DSS
+- Clone the repository and create a tar.gz archive of it
+- Create a virtualenv with the DSS requirements and install the archive
+- Import the modules from your own scripts
+
+You can reuse these samples as they are, customize them for your own needs, and even package them into plugins.
+
+Create a dedicated virtual environment and install the following packages:
+* `dataiku-internal-client`: follow the instructions in the [DSS doc](https://doc.dataiku.com/dss/latest/python-api/outside-usage.html#installing-the-package)
+* `dataikuapi`:
+  ```
+  $ pip install dataiku-api-client
+  ```
+* `pandas`:
+  ```
+  $ pip install "pandas>=1.0,<1.1"
+  ```
+
+### Structure
+
+```
+dss-code-samples
+|_admin
+|_applications
+|_datasets
+|_formulas
+|_metrics_and_checks
+|_machine_learning
+|_partitioning
+|_scenarios
+|_statistics
+|_webapps
+```
+
+
+
diff --git a/administration/index-all-hive-databases/README.md b/_old/administration/index-all-hive-databases/README.md similarity index 100% rename from administration/index-all-hive-databases/README.md rename to _old/administration/index-all-hive-databases/README.md diff --git a/administration/index-all-hive-databases/index-all-hive-databases.py b/_old/administration/index-all-hive-databases/index-all-hive-databases.py similarity index 100% rename from administration/index-all-hive-databases/index-all-hive-databases.py rename to _old/administration/index-all-hive-databases/index-all-hive-databases.py diff --git a/client_api_utils/README.md b/_old/client_api_utils/README.md similarity index 100% rename from client_api_utils/README.md rename to _old/client_api_utils/README.md diff --git a/client_api_utils/hive_config_migration/gracefully_provited_hive_legacy_migration.py b/_old/client_api_utils/hive_config_migration/gracefully_provited_hive_legacy_migration.py similarity index 100% rename from client_api_utils/hive_config_migration/gracefully_provited_hive_legacy_migration.py rename to _old/client_api_utils/hive_config_migration/gracefully_provited_hive_legacy_migration.py diff --git a/client_api_utils/project_bundle_deployment/README.md b/_old/client_api_utils/project_bundle_deployment/README.md similarity index 100% rename from client_api_utils/project_bundle_deployment/README.md rename to _old/client_api_utils/project_bundle_deployment/README.md diff --git a/client_api_utils/project_bundle_deployment/manage_bundles.py b/_old/client_api_utils/project_bundle_deployment/manage_bundles.py similarity index 100% rename from client_api_utils/project_bundle_deployment/manage_bundles.py rename to _old/client_api_utils/project_bundle_deployment/manage_bundles.py diff --git a/compute_partition_list/README.md b/_old/compute_partition_list/README.md similarity index 100% rename from compute_partition_list/README.md rename to _old/compute_partition_list/README.md diff --git a/compute_partition_list/compute_partition_list.py b/_old/compute_partition_list/compute_partition_list.py similarity index 100% rename from compute_partition_list/compute_partition_list.py rename to 
_old/compute_partition_list/compute_partition_list.py diff --git a/custom_python_models/README.md b/_old/custom_python_models/README.md similarity index 100% rename from custom_python_models/README.md rename to _old/custom_python_models/README.md diff --git a/custom_python_models/lightgbm.py b/_old/custom_python_models/lightgbm.py similarity index 100% rename from custom_python_models/lightgbm.py rename to _old/custom_python_models/lightgbm.py diff --git a/dataset_last_run_job_info/README.md b/_old/dataset_last_run_job_info/README.md similarity index 100% rename from dataset_last_run_job_info/README.md rename to _old/dataset_last_run_job_info/README.md diff --git a/dataset_last_run_job_info/get_job_info_for_datasets_in_project.py b/_old/dataset_last_run_job_info/get_job_info_for_datasets_in_project.py similarity index 100% rename from dataset_last_run_job_info/get_job_info_for_datasets_in_project.py rename to _old/dataset_last_run_job_info/get_job_info_for_datasets_in_project.py diff --git a/partition_list_variable/README.md b/_old/partition_list_variable/README.md similarity index 100% rename from partition_list_variable/README.md rename to _old/partition_list_variable/README.md diff --git a/partition_list_variable/build_only_new_partitions.py b/_old/partition_list_variable/build_only_new_partitions.py similarity index 100% rename from partition_list_variable/build_only_new_partitions.py rename to _old/partition_list_variable/build_only_new_partitions.py diff --git a/partition_list_variable/build_whole_output_custom.py b/_old/partition_list_variable/build_whole_output_custom.py similarity index 100% rename from partition_list_variable/build_whole_output_custom.py rename to _old/partition_list_variable/build_whole_output_custom.py diff --git a/python_io_examples/README.md b/_old/python_io_examples/README.md similarity index 100% rename from python_io_examples/README.md rename to _old/python_io_examples/README.md diff --git a/python_io_examples/pandas_chunked_read_write.py b/_old/python_io_examples/pandas_chunked_read_write.py similarity index 100% rename from python_io_examples/pandas_chunked_read_write.py rename to _old/python_io_examples/pandas_chunked_read_write.py diff --git a/reco/README.md b/_old/reco/README.md similarity index 100% rename from reco/README.md rename to _old/reco/README.md diff --git a/reco/__init__.py b/_old/reco/__init__.py similarity index 100% rename from reco/__init__.py rename to _old/reco/__init__.py diff --git a/reco/surprise_wrapper.py b/_old/reco/surprise_wrapper.py similarity index 100% rename from reco/surprise_wrapper.py rename to _old/reco/surprise_wrapper.py diff --git a/visualization/flask-webapps/authenticate-calls/README.md b/_old/visualization/flask-webapps/authenticate-calls/README.md similarity index 100% rename from visualization/flask-webapps/authenticate-calls/README.md rename to _old/visualization/flask-webapps/authenticate-calls/README.md diff --git a/visualization/flask-webapps/authenticate-calls/app.js b/_old/visualization/flask-webapps/authenticate-calls/app.js similarity index 100% rename from visualization/flask-webapps/authenticate-calls/app.js rename to _old/visualization/flask-webapps/authenticate-calls/app.js diff --git a/visualization/flask-webapps/authenticate-calls/backend.py b/_old/visualization/flask-webapps/authenticate-calls/backend.py similarity index 100% rename from visualization/flask-webapps/authenticate-calls/backend.py rename to _old/visualization/flask-webapps/authenticate-calls/backend.py diff --git 
a/visualization/flask-webapps/flask-session-per-browser/README.md b/_old/visualization/flask-webapps/flask-session-per-browser/README.md similarity index 100% rename from visualization/flask-webapps/flask-session-per-browser/README.md rename to _old/visualization/flask-webapps/flask-session-per-browser/README.md diff --git a/visualization/flask-webapps/flask-session-per-browser/app.js b/_old/visualization/flask-webapps/flask-session-per-browser/app.js similarity index 100% rename from visualization/flask-webapps/flask-session-per-browser/app.js rename to _old/visualization/flask-webapps/flask-session-per-browser/app.js diff --git a/visualization/flask-webapps/flask-session-per-browser/backend.py b/_old/visualization/flask-webapps/flask-session-per-browser/backend.py similarity index 100% rename from visualization/flask-webapps/flask-session-per-browser/backend.py rename to _old/visualization/flask-webapps/flask-session-per-browser/backend.py diff --git a/visualization/flask-webapps/flask-session-per-browser/body.html b/_old/visualization/flask-webapps/flask-session-per-browser/body.html similarity index 100% rename from visualization/flask-webapps/flask-session-per-browser/body.html rename to _old/visualization/flask-webapps/flask-session-per-browser/body.html diff --git a/visualization/flask-webapps/flask-session-per-frontend/README.md b/_old/visualization/flask-webapps/flask-session-per-frontend/README.md similarity index 100% rename from visualization/flask-webapps/flask-session-per-frontend/README.md rename to _old/visualization/flask-webapps/flask-session-per-frontend/README.md diff --git a/visualization/flask-webapps/flask-session-per-frontend/app.js b/_old/visualization/flask-webapps/flask-session-per-frontend/app.js similarity index 100% rename from visualization/flask-webapps/flask-session-per-frontend/app.js rename to _old/visualization/flask-webapps/flask-session-per-frontend/app.js diff --git a/visualization/flask-webapps/flask-session-per-frontend/backend.py b/_old/visualization/flask-webapps/flask-session-per-frontend/backend.py similarity index 100% rename from visualization/flask-webapps/flask-session-per-frontend/backend.py rename to _old/visualization/flask-webapps/flask-session-per-frontend/backend.py diff --git a/visualization/flask-webapps/flask-session-per-frontend/body.html b/_old/visualization/flask-webapps/flask-session-per-frontend/body.html similarity index 100% rename from visualization/flask-webapps/flask-session-per-frontend/body.html rename to _old/visualization/flask-webapps/flask-session-per-frontend/body.html diff --git a/visualization/flask-webapps/flask-session-per-user/README.md b/_old/visualization/flask-webapps/flask-session-per-user/README.md similarity index 100% rename from visualization/flask-webapps/flask-session-per-user/README.md rename to _old/visualization/flask-webapps/flask-session-per-user/README.md diff --git a/visualization/flask-webapps/flask-session-per-user/app.js b/_old/visualization/flask-webapps/flask-session-per-user/app.js similarity index 100% rename from visualization/flask-webapps/flask-session-per-user/app.js rename to _old/visualization/flask-webapps/flask-session-per-user/app.js diff --git a/visualization/flask-webapps/flask-session-per-user/backend.py b/_old/visualization/flask-webapps/flask-session-per-user/backend.py similarity index 100% rename from visualization/flask-webapps/flask-session-per-user/backend.py rename to _old/visualization/flask-webapps/flask-session-per-user/backend.py diff --git 
a/visualization/flask-webapps/flask-session-per-user/body.html b/_old/visualization/flask-webapps/flask-session-per-user/body.html similarity index 100% rename from visualization/flask-webapps/flask-session-per-user/body.html rename to _old/visualization/flask-webapps/flask-session-per-user/body.html diff --git a/visualization/shiny/authenticate-calls/README.md b/_old/visualization/shiny/authenticate-calls/README.md similarity index 100% rename from visualization/shiny/authenticate-calls/README.md rename to _old/visualization/shiny/authenticate-calls/README.md diff --git a/visualization/shiny/authenticate-calls/server.R b/_old/visualization/shiny/authenticate-calls/server.R similarity index 100% rename from visualization/shiny/authenticate-calls/server.R rename to _old/visualization/shiny/authenticate-calls/server.R diff --git a/visualization/shiny/authenticate-calls/ui.R b/_old/visualization/shiny/authenticate-calls/ui.R similarity index 100% rename from visualization/shiny/authenticate-calls/ui.R rename to _old/visualization/shiny/authenticate-calls/ui.R diff --git a/visualization/shiny/shiny-and-dygraphs/README.md b/_old/visualization/shiny/shiny-and-dygraphs/README.md similarity index 100% rename from visualization/shiny/shiny-and-dygraphs/README.md rename to _old/visualization/shiny/shiny-and-dygraphs/README.md diff --git a/visualization/shiny/shiny-and-dygraphs/UI.R b/_old/visualization/shiny/shiny-and-dygraphs/UI.R similarity index 100% rename from visualization/shiny/shiny-and-dygraphs/UI.R rename to _old/visualization/shiny/shiny-and-dygraphs/UI.R diff --git a/visualization/shiny/shiny-and-dygraphs/server.R b/_old/visualization/shiny/shiny-and-dygraphs/server.R similarity index 100% rename from visualization/shiny/shiny-and-dygraphs/server.R rename to _old/visualization/shiny/shiny-and-dygraphs/server.R
diff --git a/admin/__init__.py b/admin/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/admin/job_utils.py b/admin/job_utils.py
new file mode 100644
index 0000000..0939cbb
--- /dev/null
+++ b/admin/job_utils.py
@@ -0,0 +1,64 @@
+import dataiku
+from datetime import datetime
+
+
+def list_jobs_by_status(client=None, project_key=None):
+    """List jobs by current status in a given project.
+
+    Args:
+        client: A handle on the target DSS instance
+        project_key: A string representing the target project key
+
+    Returns:
+        jobs_by_status: A dict mapping each job state ("RUNNING", "FAILED",
+            "DONE", "ABORTED") to the list of matching jobs
+    """
+
+    project = client.get_project(project_key)
+    jobs_by_status = {"RUNNING": [],
+                      "FAILED": [],
+                      "DONE": [],
+                      "ABORTED": []}
+    for job in project.list_jobs():
+        if not job["stableState"]:
+            jobs_by_status["RUNNING"].append(job)
+        else:
+            jobs_by_status[job["state"]].append(job)
+    return jobs_by_status
+
+
+def filter_jobs_by_start_date(client=None, project_key=None, start_date=None):
+    """List jobs that were started after a specific date.
+
+    Args:
+        client: A handle on the target DSS instance
+        project_key: A string representing the target project key
+        start_date: A string of the form 'YYYY/mm/dd'
+
+    Returns:
+        jobs_after_start_date: A dict mapping each job state to the list of
+            jobs started after start_date
+
+    """
+    jobs_by_status = list_jobs_by_status(client, project_key)
+    # datetime.timestamp() is in seconds; DSS initiationTimestamp is in milliseconds
+    start_date_timestamp = int(datetime.strptime(start_date, "%Y/%m/%d").timestamp()) * 1000
+    is_after_start_date = lambda x, d: x["def"]["initiationTimestamp"] > d
+    jobs_after_start_date = {_status: [job for job in _list if is_after_start_date(job, start_date_timestamp)]
+                             for _status, _list in jobs_by_status.items()}
+    return jobs_after_start_date
+
+
+def abort_all_running_jobs(client=None, project_key=None):
+    """Terminate all running jobs in a project.
+
+    Args:
+        client: A handle on the target DSS instance
+        project_key: A string representing the target project key
+    """
+
+    project = client.get_project(project_key)
+    aborted_jobs = []
+    for job in project.list_jobs():
+        if not job["stableState"]:
+            job_id = job["def"]["id"]
+            aborted_jobs.append(job_id)
+            project.get_job(job_id).abort()
+    print(f"Aborted {len(aborted_jobs)} running jobs")
+
diff --git a/admin/project_utils.py b/admin/project_utils.py
new file mode 100644
index 0000000..8187901
--- /dev/null
+++ b/admin/project_utils.py
@@ -0,0 +1,31 @@
+import dataiku
+
+def edit_project_permissions(client=None, project_key=None, group=None, perms=None, revoke=False):
+    """Grant or revoke project permissions for a given group.
+
+    Args:
+        client: A handle on the target DSS instance
+        project_key: A string representing the target project key
+        group: A string representing the target group name
+        perms: A list of permissions to grant
+        revoke: A boolean for completely revoking access to the project
+    """
+
+    prj = client.get_project(project_key)
+    perm_obj = prj.get_permissions()
+    perm_list = perm_obj["permissions"]
+    # Iterate over a copy so entries can be removed safely
+    for p in list(perm_list):
+        if p["group"] == group:
+            print("Deleting existing permissions...")
+            perm_list.remove(p)
+    if revoke:
+        perm_obj["permissions"] = perm_list
+        print(f"Revoking all permissions on project {project_key} for group {group}")
+    else:
+        if not perms:
+            print("Missing permission list, will grant ADMIN instead...")
+            perms = ["admin"]
+        new_group_perms = dict({"group": group}, **{p: True for p in perms})
+        perm_obj["permissions"].append(new_group_perms)
+        print(f"Granting {perms} to group {group} on project {project_key}...")
+    prj.set_permissions(perm_obj)
diff --git a/admin/spark_utils.py b/admin/spark_utils.py
new file mode 100644
index 0000000..d7b5a10
--- /dev/null
+++ b/admin/spark_utils.py
@@ -0,0 +1,6 @@
+import dataiku
+
+def add_spark_config(client=None, config=None):
+    raise NotImplementedError
+
+
diff --git a/datasets/README.md b/datasets/README.md
new file mode 100644
index 0000000..9043a51
--- /dev/null
+++ b/datasets/README.md
@@ -0,0 +1,12 @@
+# Datasets
+
+## TODO
+
+- [ ] Programmatically build partitions
+- [ ] Retrieve last build date (PR #3)
+- [ ] Read from/write to non-local-FS-folders
+- [ ] Run containerized execution with input/output data in managed folders
+- [ ] Flag and delete orphaned datasets
+- [ ] Schema propagation from updated dataset
+- [ ] Create "Upload" dataset and add/replace file(s)
+
diff --git a/datasets/__init__.py b/datasets/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/datasets/dataset_utils.py b/datasets/dataset_utils.py
new file mode 100644
index 0000000..814d9c6
--- /dev/null
+++ b/datasets/dataset_utils.py
@@ -0,0 +1,32 @@
+import dataiku
+from datetime import datetime
+
+def get_last_build_date(client=None, project_key=None, dataset=None):
+    """Returns a datetime object representing the last time an output
+    dataset was built.
+    Args:
+        client: A handle on the target DSS instance.
+        project_key: A string representing the target project key.
+        dataset: A string representing the name of the dataset.
+    """
+    dataset_info = dataiku.Dataset(dataset).get_files_info()
+    last_modif = dataset_info.get("globalPaths")[0].get("lastModified")
+    dt = datetime.fromtimestamp(last_modif/1000)
+    return dt
+
+def build_todays_partition(client=None, project_key=None, dataset=None):
+    """Build the partition for today's date in the specified dataset.
+    Return the status of the build.
+    Args:
+        client: A handle on the target DSS instance.
+        project_key: A string representing the target project key.
+        dataset: A string representing the name of the dataset.
+    """
+    now = datetime.now()
+    partition = now.strftime("%Y-%m-%d")
+    project = client.get_project(project_key)
+    dataset = project.get_dataset(dataset)
+    job = dataset.build(partitions=partition)
+    return job.get_status()
+
+
diff --git a/deployer/README.md b/deployer/README.md
new file mode 100644
index 0000000..b15d067
--- /dev/null
+++ b/deployer/README.md
@@ -0,0 +1,3 @@
+- Keep a database connection persistent to speed up response time by opening a connection pool outside of the API function
+- Image classification scoring on a custom deep learning model
+- 
\ No newline at end of file
diff --git a/machine_learning/mltask_utils.py b/machine_learning/mltask_utils.py
new file mode 100644
index 0000000..e5e062b
--- /dev/null
+++ b/machine_learning/mltask_utils.py
@@ -0,0 +1,89 @@
+import dataiku
+
+def get_best_model(client=None,
+                   project_key=None,
+                   analysis_id=None,
+                   ml_task_id=None,
+                   metric=None):
+    """Return the 'best model' (according to the input metric) of a ML task.
+
+    Args:
+        client: A handle on the DSS instance
+        project_key: A string representing the target project key
+        analysis_id: A string linking to the target visual analysis.
+            Can be found in the analysis URL or via
+            dataikuapi.dss.project.DSSProject.list_analyses()
+        ml_task_id: A string linking to the target MLTask in a given analysis.
+            Can be found in the ML task URL or via
+            dataikuapi.dss.analysis.DSSAnalysis.list_ml_tasks()
+        metric: A string defining which metric to use for performance ranking
+
+    Returns:
+        ml_task: A handle to interact with the ML task.
+            Useful when (re)deploying the model.
+        best_model_id: A string containing the ID of the ML task's 'best model'
+
+    """
+    prj = client.get_project(project_key)
+    analysis = prj.get_analysis(analysis_id)
+    ml_task = analysis.get_ml_task(ml_task_id)
+    trained_models = ml_task.get_trained_models_ids()
+    trained_models_snippets = [ml_task.get_trained_model_snippet(m) for m in trained_models]
+    # Assumes that for your metric, "higher is better"
+    best_model_snippet = max(trained_models_snippets, key=lambda x: x[metric])
+    best_model_id = best_model_snippet["fullModelId"]
+    return ml_task, best_model_id
+
+
+def deploy_with_best_model(client=None,
+                           project_key=None,
+                           analysis_id=None,
+                           ml_task_id=None,
+                           metric=None,
+                           saved_model_name=None,
+                           training_dataset=None):
+    """Create a new Saved Model in the Flow with the 'best model' of a ML task.
+
+    Args:
+        client: A handle on the DSS instance
+        project_key: A string representing the target project key.
+        analysis_id: A string linking to the target visual analysis.
+            Can be found in the analysis URL or via
+            dataikuapi.dss.project.DSSProject.list_analyses().
+        ml_task_id: A string linking to the target MLTask in a given analysis.
+            Can be found in the ML task URL or via
+            dataikuapi.dss.analysis.DSSAnalysis.list_ml_tasks().
+        metric: A string defining which metric to use for performance ranking.
+        saved_model_name: A string to name the newly-created Saved Model.
+        training_dataset: A string representing the name of the dataset
+            used as train set.
+
+    """
+    ml_task, best_model_id = get_best_model(client,
+                                            project_key,
+                                            analysis_id,
+                                            ml_task_id,
+                                            metric)
+    ml_task.deploy_to_flow(best_model_id,
+                           saved_model_name,
+                           training_dataset)
+
+def update_with_best_model(client=None,
+                           project_key=None,
+                           analysis_id=None,
+                           ml_task_id=None,
+                           metric=None,
+                           saved_model_name=None,
+                           activate=True):
+    """Update an existing Saved Model in the Flow with the 'best model'
+    of a ML task.
+    """
+    ml_task, best_model_id = get_best_model(client,
+                                            project_key,
+                                            analysis_id,
+                                            ml_task_id,
+                                            metric)
+    # Assumes the default DSS naming convention for the training recipe
+    training_recipe_name = f"train_{saved_model_name}"
+    ml_task.redeploy_to_flow(model_id=best_model_id,
+                             recipe_name=training_recipe_name,
+                             activate=activate)
\ No newline at end of file
diff --git a/machine_learning/saved_model_utils.py b/machine_learning/saved_model_utils.py
new file mode 100644
index 0000000..00ef12d
--- /dev/null
+++ b/machine_learning/saved_model_utils.py
@@ -0,0 +1,30 @@
+import dataiku
+
+def explore_saved_models(client=None, project_key=None):
+    """List saved models of a project and give details on the active versions.
+
+    Args:
+        client: A handle on the target DSS instance
+        project_key: A string representing the target project key
+
+    Returns:
+        smdl_list: A list of dicts, one per saved model, with all version ids
+            and the algorithm + performance metrics of the active version.
+
+    """
+    smdl_list = []
+    prj = client.get_project(project_key)
+    smdl_ids = [x["id"] for x in prj.list_saved_models()]
+    for smdl in smdl_ids:
+        data = {}
+        obj = prj.get_saved_model(smdl)
+        data["version_ids"] = [m["id"] for m in obj.list_versions()]
+        active_version_id = obj.get_active_version()["id"]
+        active_version_details = obj.get_version_details(active_version_id)
+        data["active_version"] = {"id": active_version_id,
+                                  "algorithm": active_version_details.details["actualParams"]["resolved"]["algorithm"],
+                                  "performance_metrics": active_version_details.get_performance_metrics()}
+        smdl_list.append(data)
+    return smdl_list
+
+
diff --git a/metrics_and_checks/README.md b/metrics_and_checks/README.md
new file mode 100644
index 0000000..cc00c68
--- /dev/null
+++ b/metrics_and_checks/README.md
@@ -0,0 +1,4 @@
+# Metrics and checks
+
+- [ ] Retrieve metrics history of a dataset
+- [ ] Retrieve metrics history of a model
diff --git a/projects/README.md b/projects/README.md
new file mode 100644
index 0000000..f1ccdbb
--- /dev/null
+++ b/projects/README.md
@@ -0,0 +1,27 @@
+- Build all
+  ```python
+  client = dataiku.api_client()
+  project = client.get_project(dataiku.default_project_key())
+  flow = project.get_flow()
+  graph = flow.get_graph()
+  for k, v in graph.data.get('nodes').items():
+      if v.get('successors') == []:
+          definition = {
+              "type": 'RECURSIVE_BUILD',
+              "outputs": [{"id": k}]
+          }
+          print('Building dataset {}'.format(k))
+          job = project.start_job(definition)
+  ```
+  Will need adjustments if there are saved models.
+
+- Build specific tags only
+- Build specific zones only
+- Detect schema changes on a dataset and propagate them
+  ```python
+  settings = dataset.get_settings()
+  settings.get_raw()["schema"] = {"columns": []}
+  settings.save()
+  new_settings = dataset.autodetect_settings()
+  new_settings.save()
+  ```
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..63b9507
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,13 @@
+certifi==2020.6.20
+chardet==3.0.4
+idna==2.8
+numpy==1.19.4
+pandas==1.0.5
+python-dateutil==2.8.0
+pytz==2019.2
+requests==2.22.0
+six==1.15.0
+urllib3==1.25.11
+dataiku-api-client==8.0.0
+http://localhost:40000/public/packages/dataiku-internal-client.tar.gz
+
diff --git a/scenarios/README.md b/scenarios/README.md
new file mode 100644
index 0000000..43de12f
--- /dev/null
+++ b/scenarios/README.md
@@ -0,0 +1,8 @@
+# Scenarios
+
+- [ ] Implement a timeout for a particular scenario step
+
+- View all the "run after scenario" dependencies between projects
+> You'll need to write some code using the public API to loop over the scenario settings, look for follow_scenariorun triggers, and build the dependency tree yourself
+
+
diff --git a/scenarios/partitions.py b/scenarios/partitions.py
new file mode 100644
index 0000000..6b371cb
--- /dev/null
+++ b/scenarios/partitions.py
@@ -0,0 +1,48 @@
+import dataiku
+
+def build_all_partitions(scenario=None,
+                         project_key=None,
+                         input_dataset=None,
+                         output_dataset=None):
+    """Build all output partitions present in an input dataset.
+    Requires input and output datasets to share the same partitioning
+    format.
+    Args:
+        scenario: A dataiku.scenario.Scenario handle.
+        project_key: A string representing the target project key.
+        input_dataset: Name of the input dataset from which
+            to list all partitions.
+        output_dataset: String of the name of the dataset to build.
+    """
+    input_handle = dataiku.Dataset(input_dataset)
+    partitions = input_handle.list_partitions()
+    partitions_str = ','.join(partitions)
+    scenario.build_dataset(output_dataset, partitions=partitions_str)
+
+def build_new_partitions(scenario=None,
+                         project_key=None,
+                         input_dataset=None,
+                         output_dataset=None):
+    """Build partitions that are present in the input dataset but
+    not in the output dataset (= new partitions).
+    Requires input and output datasets to share the same partitioning
+    format.
+    Args:
+        scenario: A dataiku.scenario.Scenario handle.
+        project_key: A string representing the target project key.
+        input_dataset: Name of the input dataset from which
+            to list all partitions.
+        output_dataset: String of the name of the dataset to build.
+    """
+    input_handle = dataiku.Dataset(input_dataset)
+    output_handle = dataiku.Dataset(output_dataset)
+    input_partitions = set(input_handle.list_partitions())
+    output_partitions = set(output_handle.list_partitions())
+    new_partitions = input_partitions - output_partitions
+    partitions_str = ','.join(new_partitions)
+    # build_dataset expects the dataset name, not the Dataset handle
+    scenario.build_dataset(output_dataset, partitions=partitions_str)
+
+
+
+
\ No newline at end of file
diff --git a/scenarios/reporters.py b/scenarios/reporters.py
new file mode 100644
index 0000000..9de62fe
--- /dev/null
+++ b/scenarios/reporters.py
@@ -0,0 +1,32 @@
+import dataiku
+
+def add_email_recipients(client=None, project_key=None, scenario_ids=[], recipients=[]):
+    """Append additional recipients to scenario email reporters.
+
+    Args:
+        client: A handle on the target DSS instance
+        project_key: A string representing the target project key
+        scenario_ids: A list of scenario ID strings
+        recipients: A list of email address strings
+
+    """
+
+    prj = client.get_project(project_key)
+    if not scenario_ids:
+        print("No scenario id specified, will apply to ALL scenarios")
+        scenario_ids = [scn["id"] for scn in prj.list_scenarios()]
+    for scn_id in scenario_ids:
+        handle = prj.get_scenario(scn_id)
+        settings = handle.get_settings()
+        reporters = settings.raw_reporters
+        if not reporters:
+            print("No reporter found.")
+        else:
+            for rep in reporters:
+                if rep["messaging"]["type"] == "mail-scenario":
+                    if rep["messaging"]["configuration"]["recipient"]:
+                        sep = ', '
+                    else:
+                        sep = ''
+                    rep["messaging"]["configuration"]["recipient"] += (sep + ', '.join(recipients))
+        settings.save()
diff --git a/statistics/README.md b/statistics/README.md
new file mode 100644
index 0000000..54f19e4
--- /dev/null
+++ b/statistics/README.md
@@ -0,0 +1,3 @@
+# Interactive statistics
+
+- [ ] Retrieve correlation matrix from dataset/worksheet/card
diff --git a/webapps/README.md b/webapps/README.md
new file mode 100644
index 0000000..990c06c
--- /dev/null
+++ b/webapps/README.md
@@ -0,0 +1,8 @@
+# Webapps
+
+- [ ] Helpers (REST) to start/stop/list webapp backends
+- [ ] Authenticate users on a Flask webapp
+- [ ] Maintain a model per browser/frontend/user on a Flask webapp
+- [ ] Authenticate users on a Shiny webapp
+- [ ] Display interactive time series in a Shiny webapp
+- [ ] Display interactive time series in a Bokeh webapp
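
Usage example: a minimal sketch of how the new helper modules can be called from outside DSS, assuming the repository root is on the Python path and that the DSS URL, API key, and project key below are placeholders to replace with your own values.

```python
import dataikuapi

from admin import job_utils
from machine_learning import saved_model_utils

# Placeholder connection details -- point these at your own DSS node and API key.
client = dataikuapi.DSSClient("http://localhost:40000", "YOUR_API_KEY_SECRET")
PROJECT_KEY = "MY_PROJECT"

# Group the project's jobs by state, then abort whatever is still running.
jobs = job_utils.list_jobs_by_status(client=client, project_key=PROJECT_KEY)
print({state: len(job_list) for state, job_list in jobs.items()})
job_utils.abort_all_running_jobs(client=client, project_key=PROJECT_KEY)

# Inspect the active version of each saved model in the project.
for model in saved_model_utils.explore_saved_models(client=client, project_key=PROJECT_KEY):
    print(model["active_version"]["id"], model["active_version"]["algorithm"])
```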