[Confluence] Fix pagination for get_all_* methods and unify _get_paged across Cloud/Server (#1616)

Zircoz · claude · web-flow · commit 63e744f89bef · 2026-03-13T11:39:39.000+02:00
* [Confluence] Fix pagination for get_all_* methods and unify _get_paged across Cloud/Server

  Fixes #1598
  Fixes #1480

  - Switch 10 get_all_* methods to use _get_paged for full pagination
  - Unify _get_paged into ConfluenceBase (remove Cloud/Server duplicates)
  - Handle _links.next as both string and dict formats
  - Fix relative pagination URLs by prepending base URL correctly
  - Fix Cloud api_root from wiki/api/v2 to wiki/rest/api (endpoints use v1 paths)
  - Recognize api.atlassian.com in Cloud detection; support explicit cloud= kwarg
  - Add routing tests and pagination edge-case tests for both Cloud and Server

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* [Confluence] Fix CodeQL incomplete URL substring sanitization

  Use urlparse to extract and check the hostname directly instead of naive substring matching, preventing spoofing via paths like evil.com/atlassian.net/...

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* [Confluence] Simplify PR scope: revert unrelated Cloud API changes

  - Revert Cloud api_version from "latest" back to "2" (original)
  - Revert Cloud api_root from "wiki/rest/api" back to "wiki/api/v2" (original)
  - Revert Cloud URL construction: remove api_root suffix appended to self.url
  - Simplify _get_paged relative URL resolution: drop api_root-stripping branch (was only needed due to the Cloud URL change) and use urlparse(self.url).netloc directly
  - Update test_init_defaults assertions to match reverted Cloud defaults

  The Cloud api_version/api_root/URL changes were unrelated to #1598 and constituted a breaking change for existing Cloud users. The complex api_root stripping logic in _get_paged was a direct consequence of that change and is no longer needed.

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/atlassian/confluence/__init__.py b/atlassian/confluence/__init__.py
@@ -5,6 +5,8 @@
 This package provides both Cloud and Server implementations of the Confluence API.
 """
 
+from urllib.parse import urlparse
+
 from .cloud import Cloud as ConfluenceCloud
 from .server import Server as ConfluenceServer
 
@@ -18,7 +20,16 @@ class Confluence(ConfluenceBase):
 
     def __init__(self, url, *args, **kwargs):
         # Detect which implementation to use
-        if ("atlassian.net" in url or "jira.com" in url) and ("/wiki" not in url):
+        # Priority: explicit cloud= kwarg > URL-based heuristic
+        is_cloud = kwargs.get("cloud")
+        if is_cloud is None:
+            hostname = urlparse(url).hostname or ""
+            is_cloud = (
+                hostname == "atlassian.net" or hostname.endswith(".atlassian.net")
+                or hostname == "jira.com" or hostname.endswith(".jira.com")
+                or hostname == "api.atlassian.com" or hostname.endswith(".api.atlassian.com")
+            )
+        if is_cloud:
             impl = ConfluenceCloud(url, *args, **kwargs)
         else:
             impl = ConfluenceServer(url, *args, **kwargs)
diff --git a/atlassian/confluence/base.py b/atlassian/confluence/base.py
@@ -1,6 +1,7 @@
 # coding=utf-8
 
 import copy
+from urllib.parse import urlparse
 import logging
 from requests import HTTPError
 from ..rest_client import AtlassianRestAPI
@@ -134,27 +135,27 @@ def _get_paged(
 
             yield from response.get("results", [])
 
-            if self.cloud:
-                url = response.get("_links", {}).get("next", {}).get("href")
-                if url is None:
-                    break
-                # From now on we have absolute URLs with parameters
-                absolute = True
-                # Params are now provided by the url
-                params = {}
-                # Trailing should not be added as it is already part of the url
-                trailing = False
+            next_link = response.get("_links", {}).get("next")
+            if next_link is None:
+                break
+            if isinstance(next_link, str):
+                url = next_link
             else:
-                if response.get("_links", {}).get("next") is None:
-                    break
-                # For server, we need to extract the next page URL from the _links.next.href
-                next_url = response.get("_links", {}).get("next", {}).get("href")
-                if next_url is None:
-                    break
-                url = next_url
-                absolute = True
-                params = {}
-                trailing = False
+                url = next_link.get("href")
+            if url is None:
+                break
+
+            if url.startswith("/"):
+                # Relative URL from Confluence Server: prepend scheme+host from self.url
+                parsed = urlparse(self.url)
+                url = f"{parsed.scheme}://{parsed.netloc}{url}"
+
+            # From now on we have absolute URLs with parameters
+            absolute = True
+            # Params are now provided by the url
+            params = {}
+            # Trailing should not be added as it is already part of the url
+            trailing = False
 
         return
 
diff --git a/atlassian/confluence/cloud/__init__.py b/atlassian/confluence/cloud/__init__.py
@@ -28,6 +28,14 @@ def get_content_by_type(self, content_type, **kwargs):
         """Get content by type (page, blogpost, etc.)."""
         return self.get("content", params={"type": content_type, **kwargs})
 
+    def get_all_pages_from_space(self, space_key, **kwargs):
+        """Get all pages from space."""
+        return self._get_paged("content", params={"spaceKey": space_key, "type": "page", **kwargs})
+
+    def get_all_blog_posts_from_space(self, space_key, **kwargs):
+        """Get all blog posts from space."""
+        return self._get_paged("content", params={"spaceKey": space_key, "type": "blogpost", **kwargs})
+
     def create_content(self, data, **kwargs):
         """Create new content."""
         return self.post("content", data=data, **kwargs)
diff --git a/atlassian/confluence/cloud/base.py b/atlassian/confluence/cloud/base.py
@@ -24,53 +24,4 @@ def __init__(self, url, *args, **kwargs):
         """
         super(ConfluenceCloudBase, self).__init__(url, *args, **kwargs)
 
-    def _get_paged(
-        self,
-        url,
-        params=None,
-        data=None,
-        flags=None,
-        trailing=None,
-        absolute=False,
-    ):
-        """
-        Used to get the paged data for Confluence Cloud
-
-        :param url: string:                        The url to retrieve
-        :param params: dict (default is None):     The parameter's
-        :param data: dict (default is None):       The data
-        :param flags: string[] (default is None):  The flags
-        :param trailing: bool (default is None):   If True, a trailing slash is added to the url
-        :param absolute: bool (default is False):  If True, the url is used absolute and not relative to the root
-
-        :return: A generator object for the data elements
-        """
-        if params is None:
-            params = {}
-
-        while True:
-            response = self.get(
-                url,
-                trailing=trailing,
-                params=params,
-                data=data,
-                flags=flags,
-                absolute=absolute,
-            )
-            if "results" not in response:
-                return
-
-            yield from response.get("results", [])
-
-            # Confluence Cloud uses _links.next.href for pagination
-            url = response.get("_links", {}).get("next", {}).get("href")
-            if url is None:
-                break
-            # From now on we have absolute URLs with parameters
-            absolute = True
-            # Params are now provided by the url
-            params = {}
-            # Trailing should not be added as it is already part of the url
-            trailing = False
 
-        return
diff --git a/atlassian/confluence/server/__init__.py b/atlassian/confluence/server/__init__.py
@@ -62,11 +62,11 @@ def get_content_by_id(self, content_id, **kwargs):
 
     def get_all_pages_from_space(self, space_key, **kwargs):
         """Get all pages from space."""
-        return self.get("content", params={"spaceKey": space_key, "type": "page", **kwargs})
+        return self._get_paged("content", params={"spaceKey": space_key, "type": "page", **kwargs})
 
     def get_all_blog_posts_from_space(self, space_key, **kwargs):
         """Get all blog posts from space."""
-        return self.get("content", params={"spaceKey": space_key, "type": "blogpost", **kwargs})
+        return self._get_paged("content", params={"spaceKey": space_key, "type": "blogpost", **kwargs})
 
     def get_page_by_title(self, space_key, title, **kwargs):
         """Get page by title and space key."""
@@ -195,11 +195,11 @@ def remove_content_label(self, content_id, label_name, **kwargs):
 
     def get_all_pages_by_label(self, label, **kwargs):
         """Get all pages by label."""
-        return self.get("content", params={"label": label, "type": "page", **kwargs})
+        return self._get_paged("content", params={"label": label, "type": "page", **kwargs})
 
     def get_all_blog_posts_by_label(self, label, **kwargs):
         """Get all blog posts by label."""
-        return self.get("content", params={"label": label, "type": "blogpost", **kwargs})
+        return self._get_paged("content", params={"label": label, "type": "blogpost", **kwargs})
 
     # Attachment Management
     def get_attachments(self, content_id, **kwargs):
@@ -293,24 +293,24 @@ def get_draft_content(self, content_id, **kwargs):
 
     def get_all_draft_pages_from_space(self, space_key, **kwargs):
         """Get all draft pages from space."""
-        return self.get("content", params={"spaceKey": space_key, "type": "page", "status": "draft", **kwargs})
+        return self._get_paged("content", params={"spaceKey": space_key, "type": "page", "status": "draft", **kwargs})
 
     def get_all_draft_blog_posts_from_space(self, space_key, **kwargs):
         """Get all draft blog posts from space."""
-        return self.get("content", params={"spaceKey": space_key, "type": "blogpost", "status": "draft", **kwargs})
+        return self._get_paged("content", params={"spaceKey": space_key, "type": "blogpost", "status": "draft", **kwargs})
 
     # Trash Management
     def get_trash_content(self, space_key, **kwargs):
         """Get trash content."""
-        return self.get("content", params={"spaceKey": space_key, "status": "trashed", **kwargs})
+        return self._get_paged("content", params={"spaceKey": space_key, "status": "trashed", **kwargs})
 
     def get_all_pages_from_space_trash(self, space_key, **kwargs):
         """Get all pages from space trash."""
-        return self.get("content", params={"spaceKey": space_key, "type": "page", "status": "trashed", **kwargs})
+        return self._get_paged("content", params={"spaceKey": space_key, "type": "page", "status": "trashed", **kwargs})
 
     def get_all_blog_posts_from_space_trash(self, space_key, **kwargs):
         """Get all blog posts from space trash."""
-        return self.get("content", params={"spaceKey": space_key, "type": "blogpost", "status": "trashed", **kwargs})
+        return self._get_paged("content", params={"spaceKey": space_key, "type": "blogpost", "status": "trashed", **kwargs})
 
     # Export
     def export_content(self, content_id, **kwargs):
diff --git a/atlassian/confluence/server/base.py b/atlassian/confluence/server/base.py
@@ -24,54 +24,4 @@ def __init__(self, url, *args, **kwargs):
         """
         super(ConfluenceServerBase, self).__init__(url, *args, **kwargs)
 
-    def _get_paged(
-        self,
-        url,
-        params=None,
-        data=None,
-        flags=None,
-        trailing=False,
-        absolute=False,
-    ):
-        """
-        Used to get the paged data for Confluence Server
-
-        :param url: string:                        The url to retrieve
-        :param params: dict (default is None):     The parameter's
-        :param data: dict (default is None):       The data
-        :param flags: string[] (default is None):  The flags
-        :param trailing: bool (default is None):   If True, a trailing slash is added to the url
-        :param absolute: bool (default is False):  If True, the url is used absolute and not relative to the root
-
-        :return: A generator object for the data elements
-        """
-        if params is None:
-            params = {}
-
-        while True:
-            response = self.get(
-                url,
-                trailing=trailing,
-                params=params,
-                data=data,
-                flags=flags,
-                absolute=absolute,
-            )
-            if "results" not in response:
-                return
-
-            yield from response.get("results", [])
-
-            # Confluence Server uses _links.next.href for pagination
-            if response.get("_links", {}).get("next") is None:
-                break
-            # For server, we need to extract the next page URL from the _links.next.href
-            next_url = response.get("_links", {}).get("next", {}).get("href")
-            if next_url is None:
-                break
-            url = next_url
-            absolute = True
-            params = {}
-            trailing = False
 
-        return
diff --git a/tests/confluence/test_confluence_cloud.py b/tests/confluence/test_confluence_cloud.py
diff --git a/tests/confluence/test_confluence_routing.py b/tests/confluence/test_confluence_routing.py
diff --git a/tests/confluence/test_confluence_server.py b/tests/confluence/test_confluence_server.py