From 225b3b1304f6d4f45ece6a2eab1473a21ec0cefa Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Fri, 10 Apr 2026 05:11:20 -0400 Subject: [PATCH 1/2] Handle unexpected HTTP 2XX responses When the API returns an HTTP 202 response, query() can return None, which breaks query_iterative() because it doesn't handle that case. Unexpected HTTP responses should be surfaced as exceptions, so this change makes query() raise an exception for any unhandled HTTP response. Fixes #95 It can also be useful to retry requests when doing iterative queries, so query_iterative() now accepts a retry argument that it passes through to query(). If the HTTP 202 responses really are transient they will get retried, like other errors. --- dimcli/core/api.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/dimcli/core/api.py b/dimcli/core/api.py index aac0ad0..87d4b2d 100644 --- a/dimcli/core/api.py +++ b/dimcli/core/api.py @@ -14,6 +14,7 @@ import IPython.display from itertools import islice import urllib.parse +from requests.exceptions import HTTPError import pandas as pd @@ -193,11 +194,18 @@ def query(self, q, show_results=None, retry=0, verbose=None): if verbose: printDebug("ERROR LOG\n---\nQuery\n---\n" + str(q), "red") if verbose: printDebug("Response.header\n---\n" + str(response.headers), "red") if verbose: printDebug("Response.content\n---\n" +str(response.content), "red") + response.raise_for_status() + # raise_for_status() doesn't treat 2XX messages as an error but empty HTTP 202 + # messages have been observed. + # + # Any HTTP response that has made it here needs to trigger an exception or else we + # will return None, which can cause problems for calling code.
+ raise HTTPError(f"Unexpected response HTTP {response.status_code}", response=response) - def query_iterative(self, q, show_results=None, limit=1000, skip=0, pause=1.5, force=False, maxlimit=0, verbose=None, _tot_count_prev_query=0, _warnings_tot=None): + def query_iterative(self, q, show_results=None, limit=1000, skip=0, pause=1.5, force=False, maxlimit=0, verbose=None, retry=0, _tot_count_prev_query=0, _warnings_tot=None): """Runs a DSL query and then keep querying until all matching records have been extracted. The API returns a maximum of 1000 records per call. If a DSL query results in more than 1000 matches, it is possible to use pagination to get more results, up to 50k. @@ -224,7 +232,7 @@ def query_iterative(self, q, show_results=None, limit=1000, skip=0, pause=1.5, f The maximum number of records to extract in total. If 0, all available records are extracted, up to the API upper limit of 50k records per query. verbose : bool, default=False Verbose mode. - + retry: number of retries per individual request, when an error is encountered Returns ------- @@ -291,7 +299,7 @@ def query_iterative(self, q, show_results=None, limit=1000, skip=0, pause=1.5, f q2 = q + " limit %d skip %d" % (limit, skip) start = time.time() - res = self.query(q2, show_results=False, retry=0, verbose=False) + res = self.query(q2, show_results=False, retry=retry, verbose=False) end = time.time() elapsed = end - start @@ -1011,4 +1019,4 @@ def __repr__(self): # 2019-12-17: for backward compatibility # remove once all notebooks code has been updated Result = DslDataset -Dataset = DslDataset \ No newline at end of file +Dataset = DslDataset From 9d40f8b2b31eb309d669d98a0c7f4bf01ea4ff04 Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Tue, 14 Apr 2026 16:34:30 -0400 Subject: [PATCH 2/2] Ensure tot is not None I saw this error go by when using query_iterative(): ``` TypeError: '>' not supported between instances of 'NoneType' and 'int' File 
"/home/airflow/.local/lib/python3.12/site-packages/airflow/sdk/execution_time/task_runner.py", line 1112 in run File "/home/airflow/.local/lib/python3.12/site-packages/airflow/sdk/execution_time/task_runner.py", line 1523 in _execute_task File "/home/airflow/.local/lib/python3.12/site-packages/airflow/sdk/bases/operator.py", line 417 in wrapper File "/home/airflow/.local/lib/python3.12/site-packages/airflow/sdk/bases/decorator.py", line 252 in execute File "/home/airflow/.local/lib/python3.12/site-packages/airflow/sdk/bases/operator.py", line 417 in wrapper File "/home/airflow/.local/lib/python3.12/site-packages/airflow/providers/standard/operators/python.py", line 228 in execute File "/home/airflow/.local/lib/python3.12/site-packages/airflow/providers/standard/operators/python.py", line 251 in execute_callable File "/home/airflow/.local/lib/python3.12/site-packages/airflow/sdk/execution_time/callback_runner.py", line 82 in run File "/opt/airflow/rialto_airflow/dags/harvest.py", line 82 in dimensions_harvest File "/opt/airflow/rialto_airflow/harvest/dimensions.py", line 42 in harvest File "/opt/airflow/rialto_airflow/harvest/dimensions.py", line 108 in publications_from_orcid File "/opt/airflow/rialto_airflow/harvest/dimensions.py", line 217 in query_with_retry File "/home/airflow/.local/lib/python3.12/site-packages/dimcli/core/api.py", line 342 in query_iterative File "/home/airflow/.local/lib/python3.12/site-packages/dimcli/core/api.py", line 325 in query_iterative ``` It seems like ensuring `tot` is not `None` here will help matters? 
--- dimcli/core/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dimcli/core/api.py b/dimcli/core/api.py index 87d4b2d..3863b50 100644 --- a/dimcli/core/api.py +++ b/dimcli/core/api.py @@ -322,7 +322,7 @@ def query_iterative(self, q, show_results=None, limit=1000, skip=0, pause=1.5, f tot = _tot_count_prev_query # when force=True, we have no current query stats new_skip = skip+limit - if tot > 0 and new_skip > tot: + if tot is not None and tot > 0 and new_skip > tot: new_skip = tot if verbose and tot: # if not first iteration t = "%.2f" % elapsed