Skip to content

Commit 63e744f

Browse files
Zircoz and claude authored
[Confluence] Fix pagination for get_all_* methods and unify _get_paged across Cloud/Server (#1616)
* [Confluence] Fix pagination for get_all_* methods and unify _get_paged across Cloud/Server

  Fixes #1598
  Fixes #1480

  - Switch 10 get_all_* methods to use _get_paged for full pagination
  - Unify _get_paged into ConfluenceBase (remove Cloud/Server duplicates)
  - Handle _links.next as both string and dict formats
  - Fix relative pagination URLs by prepending base URL correctly
  - Fix Cloud api_root from wiki/api/v2 to wiki/rest/api (endpoints use v1 paths)
  - Recognize api.atlassian.com in Cloud detection; support explicit cloud= kwarg
  - Add routing tests and pagination edge-case tests for both Cloud and Server

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* [Confluence] Fix CodeQL incomplete URL substring sanitization

  Use urlparse to extract and check the hostname directly instead of naive substring matching, preventing spoofing via paths like evil.com/atlassian.net/...

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* [Confluence] Simplify PR scope: revert unrelated Cloud API changes

  - Revert Cloud api_version from "latest" back to "2" (original)
  - Revert Cloud api_root from "wiki/rest/api" back to "wiki/api/v2" (original)
  - Revert Cloud URL construction: remove api_root suffix appended to self.url
  - Simplify _get_paged relative URL resolution: drop api_root-stripping branch (was only needed due to the Cloud URL change) and use urlparse(self.url).netloc directly
  - Update test_init_defaults assertions to match reverted Cloud defaults

  The Cloud api_version/api_root/URL changes were unrelated to #1598 and constituted a breaking change for existing Cloud users. The complex api_root stripping logic in _get_paged was a direct consequence of that change and is no longer needed.

  Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c924704 commit 63e744f

File tree

9 files changed

+389
-147
lines changed

9 files changed

+389
-147
lines changed

atlassian/confluence/__init__.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
This package provides both Cloud and Server implementations of the Confluence API.
66
"""
77

8+
from urllib.parse import urlparse
9+
810
from .cloud import Cloud as ConfluenceCloud
911
from .server import Server as ConfluenceServer
1012

@@ -18,7 +20,16 @@ class Confluence(ConfluenceBase):
1820

1921
def __init__(self, url, *args, **kwargs):
2022
# Detect which implementation to use
21-
if ("atlassian.net" in url or "jira.com" in url) and ("/wiki" not in url):
23+
# Priority: explicit cloud= kwarg > URL-based heuristic
24+
is_cloud = kwargs.get("cloud")
25+
if is_cloud is None:
26+
hostname = urlparse(url).hostname or ""
27+
is_cloud = (
28+
hostname == "atlassian.net" or hostname.endswith(".atlassian.net")
29+
or hostname == "jira.com" or hostname.endswith(".jira.com")
30+
or hostname == "api.atlassian.com" or hostname.endswith(".api.atlassian.com")
31+
)
32+
if is_cloud:
2233
impl = ConfluenceCloud(url, *args, **kwargs)
2334
else:
2435
impl = ConfluenceServer(url, *args, **kwargs)

atlassian/confluence/base.py

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# coding=utf-8
22

33
import copy
4+
from urllib.parse import urlparse
45
import logging
56
from requests import HTTPError
67
from ..rest_client import AtlassianRestAPI
@@ -134,27 +135,27 @@ def _get_paged(
134135

135136
yield from response.get("results", [])
136137

137-
if self.cloud:
138-
url = response.get("_links", {}).get("next", {}).get("href")
139-
if url is None:
140-
break
141-
# From now on we have absolute URLs with parameters
142-
absolute = True
143-
# Params are now provided by the url
144-
params = {}
145-
# Trailing should not be added as it is already part of the url
146-
trailing = False
138+
next_link = response.get("_links", {}).get("next")
139+
if next_link is None:
140+
break
141+
if isinstance(next_link, str):
142+
url = next_link
147143
else:
148-
if response.get("_links", {}).get("next") is None:
149-
break
150-
# For server, we need to extract the next page URL from the _links.next.href
151-
next_url = response.get("_links", {}).get("next", {}).get("href")
152-
if next_url is None:
153-
break
154-
url = next_url
155-
absolute = True
156-
params = {}
157-
trailing = False
144+
url = next_link.get("href")
145+
if url is None:
146+
break
147+
148+
if url.startswith("/"):
149+
# Relative URL from Confluence Server: prepend scheme+host from self.url
150+
parsed = urlparse(self.url)
151+
url = f"{parsed.scheme}://{parsed.netloc}{url}"
152+
153+
# From now on we have absolute URLs with parameters
154+
absolute = True
155+
# Params are now provided by the url
156+
params = {}
157+
# Trailing should not be added as it is already part of the url
158+
trailing = False
158159

159160
return
160161

atlassian/confluence/cloud/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,14 @@ def get_content_by_type(self, content_type, **kwargs):
2828
"""Get content by type (page, blogpost, etc.)."""
2929
return self.get("content", params={"type": content_type, **kwargs})
3030

31+
def get_all_pages_from_space(self, space_key, **kwargs):
32+
"""Get all pages from space."""
33+
return self._get_paged("content", params={"spaceKey": space_key, "type": "page", **kwargs})
34+
35+
def get_all_blog_posts_from_space(self, space_key, **kwargs):
36+
"""Get all blog posts from space."""
37+
return self._get_paged("content", params={"spaceKey": space_key, "type": "blogpost", **kwargs})
38+
3139
def create_content(self, data, **kwargs):
3240
"""Create new content."""
3341
return self.post("content", data=data, **kwargs)

atlassian/confluence/cloud/base.py

Lines changed: 0 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -24,53 +24,4 @@ def __init__(self, url, *args, **kwargs):
2424
"""
2525
super(ConfluenceCloudBase, self).__init__(url, *args, **kwargs)
2626

27-
def _get_paged(
28-
self,
29-
url,
30-
params=None,
31-
data=None,
32-
flags=None,
33-
trailing=None,
34-
absolute=False,
35-
):
36-
"""
37-
Used to get the paged data for Confluence Cloud
38-
39-
:param url: string: The url to retrieve
40-
:param params: dict (default is None): The parameter's
41-
:param data: dict (default is None): The data
42-
:param flags: string[] (default is None): The flags
43-
:param trailing: bool (default is None): If True, a trailing slash is added to the url
44-
:param absolute: bool (default is False): If True, the url is used absolute and not relative to the root
45-
46-
:return: A generator object for the data elements
47-
"""
48-
if params is None:
49-
params = {}
50-
51-
while True:
52-
response = self.get(
53-
url,
54-
trailing=trailing,
55-
params=params,
56-
data=data,
57-
flags=flags,
58-
absolute=absolute,
59-
)
60-
if "results" not in response:
61-
return
62-
63-
yield from response.get("results", [])
64-
65-
# Confluence Cloud uses _links.next.href for pagination
66-
url = response.get("_links", {}).get("next", {}).get("href")
67-
if url is None:
68-
break
69-
# From now on we have absolute URLs with parameters
70-
absolute = True
71-
# Params are now provided by the url
72-
params = {}
73-
# Trailing should not be added as it is already part of the url
74-
trailing = False
7527

76-
return

atlassian/confluence/server/__init__.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,11 @@ def get_content_by_id(self, content_id, **kwargs):
6262

6363
def get_all_pages_from_space(self, space_key, **kwargs):
6464
"""Get all pages from space."""
65-
return self.get("content", params={"spaceKey": space_key, "type": "page", **kwargs})
65+
return self._get_paged("content", params={"spaceKey": space_key, "type": "page", **kwargs})
6666

6767
def get_all_blog_posts_from_space(self, space_key, **kwargs):
6868
"""Get all blog posts from space."""
69-
return self.get("content", params={"spaceKey": space_key, "type": "blogpost", **kwargs})
69+
return self._get_paged("content", params={"spaceKey": space_key, "type": "blogpost", **kwargs})
7070

7171
def get_page_by_title(self, space_key, title, **kwargs):
7272
"""Get page by title and space key."""
@@ -195,11 +195,11 @@ def remove_content_label(self, content_id, label_name, **kwargs):
195195

196196
def get_all_pages_by_label(self, label, **kwargs):
197197
"""Get all pages by label."""
198-
return self.get("content", params={"label": label, "type": "page", **kwargs})
198+
return self._get_paged("content", params={"label": label, "type": "page", **kwargs})
199199

200200
def get_all_blog_posts_by_label(self, label, **kwargs):
201201
"""Get all blog posts by label."""
202-
return self.get("content", params={"label": label, "type": "blogpost", **kwargs})
202+
return self._get_paged("content", params={"label": label, "type": "blogpost", **kwargs})
203203

204204
# Attachment Management
205205
def get_attachments(self, content_id, **kwargs):
@@ -293,24 +293,24 @@ def get_draft_content(self, content_id, **kwargs):
293293

294294
def get_all_draft_pages_from_space(self, space_key, **kwargs):
295295
"""Get all draft pages from space."""
296-
return self.get("content", params={"spaceKey": space_key, "type": "page", "status": "draft", **kwargs})
296+
return self._get_paged("content", params={"spaceKey": space_key, "type": "page", "status": "draft", **kwargs})
297297

298298
def get_all_draft_blog_posts_from_space(self, space_key, **kwargs):
299299
"""Get all draft blog posts from space."""
300-
return self.get("content", params={"spaceKey": space_key, "type": "blogpost", "status": "draft", **kwargs})
300+
return self._get_paged("content", params={"spaceKey": space_key, "type": "blogpost", "status": "draft", **kwargs})
301301

302302
# Trash Management
303303
def get_trash_content(self, space_key, **kwargs):
304304
"""Get trash content."""
305-
return self.get("content", params={"spaceKey": space_key, "status": "trashed", **kwargs})
305+
return self._get_paged("content", params={"spaceKey": space_key, "status": "trashed", **kwargs})
306306

307307
def get_all_pages_from_space_trash(self, space_key, **kwargs):
308308
"""Get all pages from space trash."""
309-
return self.get("content", params={"spaceKey": space_key, "type": "page", "status": "trashed", **kwargs})
309+
return self._get_paged("content", params={"spaceKey": space_key, "type": "page", "status": "trashed", **kwargs})
310310

311311
def get_all_blog_posts_from_space_trash(self, space_key, **kwargs):
312312
"""Get all blog posts from space trash."""
313-
return self.get("content", params={"spaceKey": space_key, "type": "blogpost", "status": "trashed", **kwargs})
313+
return self._get_paged("content", params={"spaceKey": space_key, "type": "blogpost", "status": "trashed", **kwargs})
314314

315315
# Export
316316
def export_content(self, content_id, **kwargs):

atlassian/confluence/server/base.py

Lines changed: 0 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -24,54 +24,4 @@ def __init__(self, url, *args, **kwargs):
2424
"""
2525
super(ConfluenceServerBase, self).__init__(url, *args, **kwargs)
2626

27-
def _get_paged(
28-
self,
29-
url,
30-
params=None,
31-
data=None,
32-
flags=None,
33-
trailing=False,
34-
absolute=False,
35-
):
36-
"""
37-
Used to get the paged data for Confluence Server
38-
39-
:param url: string: The url to retrieve
40-
:param params: dict (default is None): The parameter's
41-
:param data: dict (default is None): The data
42-
:param flags: string[] (default is None): The flags
43-
:param trailing: bool (default is None): If True, a trailing slash is added to the url
44-
:param absolute: bool (default is False): If True, the url is used absolute and not relative to the root
45-
46-
:return: A generator object for the data elements
47-
"""
48-
if params is None:
49-
params = {}
50-
51-
while True:
52-
response = self.get(
53-
url,
54-
trailing=trailing,
55-
params=params,
56-
data=data,
57-
flags=flags,
58-
absolute=absolute,
59-
)
60-
if "results" not in response:
61-
return
62-
63-
yield from response.get("results", [])
64-
65-
# Confluence Server uses _links.next.href for pagination
66-
if response.get("_links", {}).get("next") is None:
67-
break
68-
# For server, we need to extract the next page URL from the _links.next.href
69-
next_url = response.get("_links", {}).get("next", {}).get("href")
70-
if next_url is None:
71-
break
72-
url = next_url
73-
absolute = True
74-
params = {}
75-
trailing = False
7627

77-
return

0 commit comments

Comments
 (0)