diff --git a/docs/src/admin/thesauri/thesauri.md b/docs/src/admin/thesauri/thesauri.md index 59f0577105d..74f3996deee 100644 --- a/docs/src/admin/thesauri/thesauri.md +++ b/docs/src/admin/thesauri/thesauri.md @@ -92,6 +92,7 @@ GeoNode provides a single command (``thesaurus``) with multiple actions: * ``list``: list existing thesauri * ``load``: load a RDF file * ``dump``: dump a thesaurus into a file +* ``autoload``: automatically discover and load all thesauri shipped by installed apps .. code-block:: @@ -102,12 +103,13 @@ GeoNode provides a single command (``thesaurus``) with multiple actions: [--format {json-ld,n3,nt,pretty-xml,sorted-xml,trig,ttl,xml}] [--default-lang LANG] [--version] [-v {0,1,2,3}] [--settings SETTINGS] [--pythonpath PYTHONPATH] [--traceback] [--no-color] [--force-color] [--skip-checks] - [{list,load,dump}] + [{list,load,dump,autoload}] - Handles thesaurus commands ['list', 'load', 'dump'] + Handles thesaurus commands ['list', 'load', 'dump', 'autoload'] positional arguments: - {list,load,dump} thesaurus operation to run + {list,load,dump,autoload} + thesaurus operation to run options: -h, --help show this help message and exit @@ -227,6 +229,63 @@ In order to only export the entries we edited, we'll issue the command:: python manage.py thesaurus dump -i labels-i18n --include "proj1_*" --include "*_ovr" -f labels-i18n.proj1.rdf +### Auto-loading thesauri: ``thesaurus autoload`` + +The ``autoload`` subcommand scans every installed Django app for a ``thesauri/`` directory +at the top level of the app package, then loads all ``.rdf`` files it finds there. +This is how GeoNode and third-party apps can ship thesauri that are loaded automatically at start-up. 
+ +```bash +python manage.py thesaurus autoload +``` + +For each ``.rdf`` file discovered, the command runs the equivalent of ``thesaurus load --action update``, +so the operation is **idempotent**: running it multiple times will not create duplicates; instead, +existing records are updated and missing ones are created. + +**Convention for app-provided thesauri** + +Place one or more ``.rdf`` files inside a ``thesauri/`` directory at the root of your app package: + +``` +my_geonode_app/ + thesauri/ + my_vocabulary.rdf + another_vocab.rdf + models.py + ... +``` + +All ``.rdf`` files in that directory are picked up automatically whenever ``thesaurus autoload`` +(or ``invoke loadthesauri``) is executed. + +!!! note + The ``autoload`` command is automatically run during GeoNode's Docker container start-up sequence (see [Initialization at boot](#initialization-at-boot)). + + +## Initialization at boot { #initialization-at-boot } + +When GeoNode starts (e.g. via the Docker entrypoint), the following initialization steps are executed in order: + +1. **Database migrations** – applies any pending schema migrations. +2. **Fixtures** – loads default OAuth2 apps, admin user, and site data (only on first boot or when ``FORCE_REINIT=true``). +3. **Static files** – collects static assets. +4. **Thesauri autoload** – runs ``thesaurus autoload`` to load or update all ``.rdf`` files found in any installed app's ``thesauri/`` directory. This step runs on **every** boot so that thesaurus updates shipped with an upgraded app are applied automatically. 
+ +To run the thesaurus autoload step manually: + +```bash +# Inside the GeoNode container +python manage.py thesaurus autoload +``` + +Or using the invoke task: + +```bash +invoke loadthesauri +``` + + ## Configuring a Thesaurus diff --git a/docs/src/setup/docker/vanilla-docker-installation.md b/docs/src/setup/docker/vanilla-docker-installation.md index 5136c1a2d0d..bef557550a8 100644 --- a/docs/src/setup/docker/vanilla-docker-installation.md +++ b/docs/src/setup/docker/vanilla-docker-installation.md @@ -81,6 +81,15 @@ Executing UWSGI server uwsgi --ini /usr/src/app/uwsgi.ini for Production [uWSGI] getting INI configuration from /usr/src/app/uwsgi.ini ``` +The container performs these initialization steps before starting the application server: + +1. **Database migrations** – applies any pending schema migrations. +2. **Fixtures** – loads default OAuth2 apps, admin user and site data (only on first boot or when ``FORCE_REINIT=true``). +3. **Static files** – collects static assets. +4. **Thesauri autoload** – scans all installed apps for a ``thesauri/`` directory and loads (or updates) any ``.rdf`` files found there. This makes sure thesauri shipped by GeoNode apps are always up-to-date. + +See [Thesauri – Initialization at boot](../../admin/thesauri/thesauri.md#initialization-at-boot) for more details on the thesaurus autoload step. + To exit just hit `CTRL+C`. This message means that the GeoNode containers have been started. Browsing to `http://localhost/` will show the GeoNode home page. You should be able to successfully log with the credentials of admin user which are defined in the .env file and start using it right away. 
diff --git a/entrypoint.sh b/entrypoint.sh index ed9469efac2..18c8ab5269f 100755 --- a/entrypoint.sh +++ b/entrypoint.sh @@ -56,6 +56,7 @@ else fi invoke statics + invoke loadthesauri echo "Executing UWSGI server $cmd for Production" fi diff --git a/geonode/base/management/commands/thesaurus.py b/geonode/base/management/commands/thesaurus.py index 95802eaf451..c7cfd9d0c1c 100644 --- a/geonode/base/management/commands/thesaurus.py +++ b/geonode/base/management/commands/thesaurus.py @@ -2,6 +2,7 @@ from django.core.management.base import BaseCommand, CommandError from geonode.base.management.command_utils import setup_logger +from geonode.base.management.commands.thesaurus_subcommands.autoload import autoload_thesauri from geonode.base.management.commands.thesaurus_subcommands.dump import ( dump_thesaurus, DUMP_FORMATS, @@ -16,7 +17,8 @@ COMMAND_LIST = "list" COMMAND_DUMP = "dump" COMMAND_LOAD = "load" -COMMANDS = [COMMAND_LIST, COMMAND_LOAD, COMMAND_DUMP] +COMMAND_AUTOLOAD = "autoload" +COMMANDS = [COMMAND_LIST, COMMAND_LOAD, COMMAND_DUMP, COMMAND_AUTOLOAD] class Command(BaseCommand): @@ -41,6 +43,12 @@ def add_arguments(self, parser): choices=ACTIONS, help="Actions to run upon data loading (default: create)", ) + load_group.add_argument( + "--langs", + dest="langs", + action="append", + help="Only import labels for the requested languages; can be repeated", + ) dump_group = parser.add_argument_group('Params for "dump" subcommand') dump_group.add_argument("-o", "--out", nargs="?", help="Full path to the output file to be created") @@ -99,6 +107,8 @@ def handle(self, *args, **options): input_file = options.get("file") action = options.get("action") identifier = options.get("identifier") + lang = options.get("lang") + langs = options.get("langs") or [] if not input_file: raise CommandError("'load' command requires the parameter.") @@ -107,7 +117,10 @@ def handle(self, *args, **options): action = ACTION_CREATE logger.info(f"Missing action param: setting actions as 
'{action}'") - load_thesaurus(input_file, identifier, action) + load_thesaurus(input_file, identifier, action, default_lang=lang, langs=langs) + + elif subcommand == COMMAND_AUTOLOAD: + autoload_thesauri() else: raise CommandError(f"Unknown subcommand: {subcommand}") diff --git a/geonode/base/management/commands/thesaurus_subcommands/autoload.py b/geonode/base/management/commands/thesaurus_subcommands/autoload.py new file mode 100644 index 00000000000..df2725d6dfa --- /dev/null +++ b/geonode/base/management/commands/thesaurus_subcommands/autoload.py @@ -0,0 +1,39 @@ +import os + +from django.apps import apps + +from geonode.base.management.command_utils import setup_logger +from geonode.base.management.commands.thesaurus_subcommands.load import load_thesaurus, ACTION_UPDATE + +logger = setup_logger() + + +def autoload_thesauri(): + """ + Discover and load all thesauri (.rdf files) found in a `thesauri/` directory + within each installed Django app. Uses the `update` action so existing entries + are updated and new ones are created without duplicates. 
+ """ + loaded = 0 + for app_config in apps.get_app_configs(): + thesauri_dir = os.path.join(app_config.path, "thesauri") + logger.debug(f"Looking for auto thesaurus in app '{app_config.name}' path: {thesauri_dir}") + if not os.path.isdir(thesauri_dir): + continue + try: + rdf_files = [f for f in os.listdir(thesauri_dir) if f.lower().endswith(".rdf")] + except OSError as e: + logger.error( + f"Failed to scan thesauri directory for app '{app_config.name}' at '{thesauri_dir}': {e}", + exc_info=True, + ) + continue + for rdf_file in sorted(rdf_files): + rdf_path = os.path.join(thesauri_dir, rdf_file) + logger.info(f"Autoloading thesaurus from app '{app_config.name}': {rdf_path}") + try: + load_thesaurus(rdf_path, identifier=None, action=ACTION_UPDATE, log_details=False) + loaded += 1 + except Exception as e: + logger.error(f"Failed to load thesaurus '{rdf_path}': {e}", exc_info=True) + logger.info(f"Autoload complete: {loaded} thesaurus file(s) loaded.") diff --git a/geonode/base/management/commands/thesaurus_subcommands/load.py b/geonode/base/management/commands/thesaurus_subcommands/load.py index 09498010a6a..586ff400077 100644 --- a/geonode/base/management/commands/thesaurus_subcommands/load.py +++ b/geonode/base/management/commands/thesaurus_subcommands/load.py @@ -45,13 +45,16 @@ FAKE_BASE_URI = "http://automatically/added/uri/" -def load_thesaurus(input_file, identifier: str, action: str = ACTION_CREATE): +def load_thesaurus(input_file, identifier: str, action: str = ACTION_CREATE, default_lang: str = None, langs: List[str] = None, log_details=True): g = Graph() # if the input_file is an UploadedFile object rather than a file path the Graph.parse() # method may not have enough info to correctly guess the type; in this case supply the # name, which should include the extension, to guess_format manually... 
+ # explodes list of comma separated langs into single list of langs + langs = [lang.strip() for item in (langs or []) for lang in item.split(",") if lang.strip()] + filename = input_file.name if isinstance(input_file, UploadedFile) else input_file rdf_format = guess_format(filename) if not identifier: @@ -65,7 +68,7 @@ def load_thesaurus(input_file, identifier: str, action: str = ACTION_CREATE): if scheme is None: raise CommandError("ConceptScheme not found in file") - default_lang = getattr(settings, "THESAURUS_DEFAULT_LANG", None) + default_lang = default_lang or getattr(settings, "THESAURUS_DEFAULT_LANG", None) or getattr(settings, "LANGUAGE_CODE", 'en') available_titles = [t for t in itertools.chain(g.objects(scheme, DC.title), g.objects(scheme, DCTERMS.title)) @@ -81,15 +84,22 @@ def load_thesaurus(input_file, identifier: str, action: str = ACTION_CREATE): Thesaurus, {"identifier": identifier}, {"date": date_issued, "description": description, "title": thesaurus_title, "about": str(scheme)}, - {"card_min": 0, "card_max": 0, "facet": False} + {"card_min": 0, "card_max": 0, "facet": False}, + log_details ) tl_cnt = tl_add = 0 tk_cnt = tk_add = 0 tkl_cnt = tkl_add = 0 + tkl_skp = 0 for lang in available_titles: if lang.language is not None: + tl_cnt += 1 + if langs and lang.language not in langs: + logger.debug(f"Skipping label for language '{lang.language}' not in requested langs {langs}") + tkl_skp += 1 + continue thesaurus_label, c = _run_action( action, ThesaurusLabel, @@ -99,8 +109,8 @@ def load_thesaurus(input_file, identifier: str, action: str = ACTION_CREATE): }, {"label": lang.value}, {}, + log_details ) - tl_cnt += 1 tl_add += 1 if c else 0 for concept in g.subjects(RDF.type, SKOS.Concept): @@ -115,7 +125,8 @@ def load_thesaurus(input_file, identifier: str, action: str = ACTION_CREATE): available_labels = [t for t in g.objects(concept, SKOS.prefLabel) if isinstance(t, Literal)] alt_label = value_for_language(available_labels, default_lang) or about - 
logger.info(f" - Parsed Concept -> about:'{about}' alt:'{alt_label}' pref:'{str(pref)}' ") + if log_details: + logger.info(f" - Parsed Concept -> about:'{about}' alt:'{alt_label}' pref:'{str(pref)}' ") tk, c = _run_action( action, @@ -126,14 +137,21 @@ def load_thesaurus(input_file, identifier: str, action: str = ACTION_CREATE): }, {"alt_label": alt_label}, {}, + log_details ) tk_cnt += 1 tk_add += 1 if c else 0 for _, pref_label in preferredLabel(g, concept): + tkl_cnt += 1 lang = pref_label.language + if langs and lang not in langs: + logger.debug(f"Skipping label for language '{lang}' not in requested langs {langs}") + tkl_skp += 1 + continue label = str(pref_label) - logger.info(f" - Label {lang}: {label}") + if log_details: + logger.info(f" - Label {lang}: {label}") tkl, c = _run_action( action, @@ -144,8 +162,8 @@ def load_thesaurus(input_file, identifier: str, action: str = ACTION_CREATE): }, {"label": label}, {}, + log_details ) - tkl_cnt += 1 tkl_add += 1 if c else 0 logger.warning(f"Thesaurus added: {cr_t}") @@ -154,7 +172,7 @@ def load_thesaurus(input_file, identifier: str, action: str = ACTION_CREATE): logger.warning(f"ThesaurusKeywordLabel added: {tkl_add:3}/{tkl_cnt:3}") -def _run_action(action: str, model: type[models.Model], pk_dict, upd_dict, create_dict) -> tuple[models.Model, bool]: +def _run_action(action: str, model: type[models.Model], pk_dict, upd_dict, create_dict, log_details) -> tuple[models.Model, bool]: def update_or_create(defaults=upd_dict, create_defaults=create_dict, **pk_dict): # this signature is available since django 5 obj, created = model.objects.get_or_create(defaults=upd_dict | create_dict, **pk_dict) @@ -162,7 +180,8 @@ def update_or_create(defaults=upd_dict, create_defaults=create_dict, **pk_dict): if not created: rows = model.objects.filter(pk=obj.pk).update(**upd_dict) if rows != 1: - logger.error(f"UPDATED {rows} rows for {model.__name__} -> {pk_dict}") + if log_details: + logger.error(f"UPDATED {rows} rows for 
{model.__name__} -> {pk_dict}") return obj, created @@ -176,14 +195,17 @@ def update_or_create(defaults=upd_dict, create_defaults=create_dict, **pk_dict): elif action == ACTION_UPDATE: obj, created = update_or_create(defaults=upd_dict, create_defaults=create_dict, **pk_dict) if created: - logger.info(f"{model.__name__} -> Created id:{pk_dict}") + if log_details: + logger.info(f"{model.__name__} -> Created id:{pk_dict}") else: - logger.info(f"{model.__name__} -> Updated id:{pk_dict} DATA:{upd_dict}") + if log_details: + logger.info(f"{model.__name__} -> Updated id:{pk_dict} DATA:{upd_dict}") elif action == ACTION_APPEND: obj, created = model.objects.get_or_create(defaults=upd_dict | create_dict, **pk_dict) if created: - logger.info(f"{model.__name__} -> Created {pk_dict}") + if log_details: + logger.info(f"{model.__name__} -> Created {pk_dict}") else: raise CommandError("No valid action found") diff --git a/geonode/tests/test_autoload_thesaurus.py b/geonode/tests/test_autoload_thesaurus.py new file mode 100644 index 00000000000..6b9013fc001 --- /dev/null +++ b/geonode/tests/test_autoload_thesaurus.py @@ -0,0 +1,112 @@ +######################################################################### +# +# Copyright (C) 2016 OSGeo +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
+# +######################################################################### + +import os +import shutil +import tempfile +from unittest.mock import patch, MagicMock + +from django.test import TestCase + +from geonode.base.management.commands.thesaurus import autoload_thesauri +from geonode.base.models import Thesaurus + + +RDF_CONTENT = """\ + + + + Autoload Test Thesaurus + 2024-01-01 + + + Concept One + + + +""" + + +class TestAutoloadThesauri(TestCase): + def setUp(self): + self.tmp_dir = tempfile.mkdtemp() + self.thesauri_dir = os.path.join(self.tmp_dir, "thesauri") + os.makedirs(self.thesauri_dir) + self.rdf_file = os.path.join(self.thesauri_dir, "autoload_test.rdf") + with open(self.rdf_file, "w", encoding="utf-8") as f: + f.write(RDF_CONTENT) + + def tearDown(self): + shutil.rmtree(self.tmp_dir, ignore_errors=True) + + def _make_app_config(self, name, path): + app_config = MagicMock() + app_config.name = name + app_config.path = path + return app_config + + def test_autoload_loads_rdf_files_from_thesauri_dirs(self): + """autoload_thesauri should load .rdf files found in thesauri/ dirs of installed apps.""" + app_configs = [self._make_app_config("fake_app", self.tmp_dir)] + with patch("geonode.base.management.commands.thesaurus.apps.get_app_configs", return_value=app_configs): + autoload_thesauri() + + self.assertTrue( + Thesaurus.objects.filter(about="http://example.com/autoload-test-scheme").exists(), + "The thesaurus should have been loaded from the app's thesauri/ directory", + ) + + def test_autoload_is_idempotent(self): + """Calling autoload_thesauri twice should not create duplicate thesauri (uses update action).""" + app_configs = [self._make_app_config("fake_app", self.tmp_dir)] + with patch("geonode.base.management.commands.thesaurus.apps.get_app_configs", return_value=app_configs): + autoload_thesauri() + autoload_thesauri() + + count = Thesaurus.objects.filter(about="http://example.com/autoload-test-scheme").count() + self.assertEqual(1, count, 
"Running autoload twice should not create duplicate thesauri") + + def test_autoload_skips_apps_without_thesauri_dir(self): + """autoload_thesauri should silently skip apps that have no thesauri/ directory.""" + app_without_thesauri = self._make_app_config("no_thesauri_app", self.tmp_dir.rstrip("/") + "_no_dir") + app_configs = [app_without_thesauri] + with patch("geonode.base.management.commands.thesaurus.apps.get_app_configs", return_value=app_configs): + # Should not raise + autoload_thesauri() + + self.assertEqual(0, Thesaurus.objects.count(), "No thesauri should be loaded when thesauri/ dir is absent") + + def test_autoload_continues_after_error(self): + """autoload_thesauri should continue loading other files if one fails.""" + bad_rdf = os.path.join(self.thesauri_dir, "aaaa_bad.rdf") + with open(bad_rdf, "w") as f: + f.write("THIS IS NOT VALID RDF") + + app_configs = [self._make_app_config("fake_app", self.tmp_dir)] + with patch("geonode.base.management.commands.thesaurus.apps.get_app_configs", return_value=app_configs): + # Should not raise despite the bad file + autoload_thesauri() + + # The valid thesaurus (sorted after the bad one alphabetically: autoload_test > aaaa_bad) should still load + self.assertTrue( + Thesaurus.objects.filter(about="http://example.com/autoload-test-scheme").exists(), + "Valid thesaurus should be loaded even if another file in the same dir fails", + ) diff --git a/tasks.py b/tasks.py index 4f5585ec6aa..86513d3380b 100755 --- a/tasks.py +++ b/tasks.py @@ -392,6 +392,15 @@ def updateadmin(ctx): ) +@task +def loadthesauri(ctx): + print("**************************thesauri*******************************") + ctx.run( + f"python manage.py thesaurus autoload --settings={_localsettings()}", + pty=True, + ) + + @task def collectmetrics(ctx): print("************************collect metrics******************************")