Skip to content

Commit 02d63d3

Browse files
committed
Create and expose metadata file
1 parent 37626da commit 02d63d3

File tree

12 files changed

+492
-16
lines changed

12 files changed

+492
-16
lines changed

pulp_python/app/management/commands/repair-python-metadata.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,14 @@ def repair_metadata(content):
2424
set_of_update_fields = set()
2525
total_repaired = 0
2626
for package in immediate_content.prefetch_related("_artifacts").iterator(chunk_size=1000):
27+
# Get the main artifact
28+
main_artifact = (
29+
package.contentartifact_set.exclude(relative_path__endswith=".metadata")
30+
.first()
31+
.artifact
32+
)
2733
new_data = artifact_to_python_content_data(
28-
package.filename, package._artifacts.get(), package.pulp_domain
34+
package.filename, main_artifact, package.pulp_domain
2935
)
3036
changed = False
3137
for field, value in new_data.items():
Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
# Generated manually on 2025-12-15 14:00 for creating missing metadata artifacts
2+
3+
from django.db import migrations
4+
5+
6+
def pulp_hashlib_new(name, *args, **kwargs):
    """
    Copied and updated (to comply with migrations) from pulpcore.

    Return a new hash object for *name*, or None when the algorithm is not
    listed in settings.ALLOWED_CONTENT_CHECKSUMS.
    """
    import hashlib
    from django.conf import settings

    if name in settings.ALLOWED_CONTENT_CHECKSUMS:
        return hashlib.new(name, *args, **kwargs)
    return None
17+
18+
19+
def init_and_validate(file, artifact_model, expected_digests=None, expected_size=None):
    """
    Copied and updated (to comply with migrations) from pulpcore.

    Hash *file* with every allowed checksum algorithm and build an unsaved
    *artifact_model* instance from the results.

    Args:
        file: Path (str) to a file on disk, or an uploaded-file object that
            already carries ``size`` and pre-computed ``hashers``.
        artifact_model: The (historical) Artifact model class to instantiate.
        expected_digests: Optional mapping of algorithm name -> expected hex
            digest to verify against.
        expected_size: Optional expected size in bytes.

    Returns:
        An unsaved *artifact_model* instance, or None when no allowed checksum
        is available or any size/digest check fails.
    """
    from django.conf import settings

    digest_fields = [
        alg
        for alg in ("sha512", "sha384", "sha256", "sha224", "sha1", "md5")
        if alg in settings.ALLOWED_CONTENT_CHECKSUMS
    ]

    if isinstance(file, str):
        hashers = {
            n: hasher for n in digest_fields if (hasher := pulp_hashlib_new(n)) is not None
        }
        # Bail out before opening the file when nothing can be computed.
        if not hashers:
            return None
        size = 0
        with open(file, "rb") as f:
            # Stream in 1 MiB chunks so large files are never fully in memory.
            while chunk := f.read(1048576):
                for hasher in hashers.values():
                    hasher.update(chunk)
                size += len(chunk)
    else:
        # Uploaded-file objects arrive with size and hashers already computed.
        size = file.size
        hashers = file.hashers

    if expected_size and size != expected_size:
        return None

    if expected_digests:
        for algorithm, expected_digest in expected_digests.items():
            # A missing algorithm or a mismatching digest both fail validation.
            if algorithm not in hashers:
                return None
            if hashers[algorithm].hexdigest() != expected_digest:
                return None

    attributes = {"size": size, "file": file}
    for algorithm in digest_fields:
        attributes[algorithm] = hashers[algorithm].hexdigest()

    return artifact_model(**attributes)
67+
68+
69+
def extract_wheel_metadata(filename):
    """
    Pull the raw ``.dist-info/METADATA`` contents out of a wheel archive.

    Returns the metadata as bytes, or None when *filename* is not a wheel,
    the archive is unreadable, or no METADATA member is present.
    """
    import zipfile

    if not filename.endswith(".whl"):
        return None
    try:
        with zipfile.ZipFile(filename, "r") as archive:
            # First member ending in .dist-info/METADATA wins, in archive order.
            member = next(
                (name for name in archive.namelist() if name.endswith(".dist-info/METADATA")),
                None,
            )
            if member is not None:
                return archive.read(member)
    except (zipfile.BadZipFile, KeyError, OSError):
        # Corrupt or unreadable archives are treated as "no metadata".
        pass
    return None
86+
87+
88+
def artifact_to_metadata_artifact(filename, artifact, tmp_dir, artifact_model):
    """
    Build and save a metadata artifact extracted from the provided wheel artifact.

    The wheel is staged as a temporary file inside *tmp_dir*, its
    ``.dist-info/METADATA`` member is extracted to a second temporary file,
    and an *artifact_model* instance is created and saved from it.

    Returns the saved artifact, or None on any failure (non-wheel filename,
    missing metadata, validation failure, or an error while saving).
    """
    import os
    import shutil
    import tempfile

    if not filename.endswith(".whl"):
        return None

    wheel_path = None
    metadata_path = None
    try:
        # Stage the wheel on disk so zipfile can read it; keep the ".whl"
        # suffix because extract_wheel_metadata() checks the extension.
        with tempfile.NamedTemporaryFile(
            "wb", dir=tmp_dir, suffix=filename, delete=False
        ) as wheel_tmp:
            wheel_path = wheel_tmp.name
            artifact.file.seek(0)
            shutil.copyfileobj(artifact.file, wheel_tmp)
            wheel_tmp.flush()

        metadata_bytes = extract_wheel_metadata(wheel_path)
        if not metadata_bytes:
            return None

        with tempfile.NamedTemporaryFile(
            "wb", dir=tmp_dir, suffix=".metadata", delete=False
        ) as metadata_tmp:
            metadata_path = metadata_tmp.name
            metadata_tmp.write(metadata_bytes)
            metadata_tmp.flush()

        # todo: pass metadata_sha256 from PPC to expected_digests in init_and_validate?
        # if not, simplify init_and_validate
        new_artifact = init_and_validate(metadata_path, artifact_model)
        if not new_artifact:
            return None

        try:
            new_artifact.save()
        except Exception:
            # Best-effort: a failed save skips this package rather than aborting.
            return None

        return new_artifact

    finally:
        # Always remove the staged temporary files, even on early return.
        for path in (wheel_path, metadata_path):
            if path and os.path.exists(path):
                os.unlink(path)
139+
140+
141+
# todo: bulk create?
142+
# todo: bulk create?
def create_missing_metadata_artifacts(apps, schema_editor):
    """
    Create metadata artifacts for PythonPackageContent instances that have
    metadata_sha256 set but are missing the corresponding metadata artifact.

    Packages are skipped (and counted) when the main artifact is absent or
    when metadata extraction/saving fails; a summary is printed at the end.
    """
    import tempfile
    from django.conf import settings

    PythonPackageContent = apps.get_model("python", "PythonPackageContent")
    ContentArtifact = apps.get_model("core", "ContentArtifact")
    Artifact = apps.get_model("core", "Artifact")

    # Only packages that advertise a metadata digest need a metadata artifact.
    packages = (
        PythonPackageContent.objects.filter(metadata_sha256__isnull=False)
        .exclude(metadata_sha256="")
        .prefetch_related("contentartifact_set")
    )
    # todo: only for testing, remove later
    created_count = 0
    skipped_count = 0

    # todo: do i need temp dir? (not needed in localhost)
    with tempfile.TemporaryDirectory(dir=settings.WORKING_DIRECTORY) as temp_dir:
        for package in packages:
            metadata_relative_path = f"{package.filename}.metadata"
            content_artifacts = list(package.contentartifact_set.all())

            if any(ca.relative_path == metadata_relative_path for ca in content_artifacts):
                # Metadata artifact already exists
                continue

            main_content_artifact = next(
                (ca for ca in content_artifacts if ca.relative_path == package.filename),
                None,
            )
            # Skip when the main ContentArtifact is missing, or when its binary
            # artifact is NULL (on-demand content that was never downloaded) --
            # reading .file from a None artifact would crash the migration.
            if main_content_artifact is None or main_content_artifact.artifact is None:
                skipped_count += 1
                continue

            metadata_artifact = artifact_to_metadata_artifact(
                package.filename, main_content_artifact.artifact, temp_dir, Artifact
            )
            if not metadata_artifact:
                # Failed to create metadata artifact
                skipped_count += 1
                continue

            try:
                ContentArtifact.objects.create(
                    artifact=metadata_artifact,
                    content=package,
                    relative_path=metadata_relative_path,
                )
                created_count += 1
            except Exception:
                # Failed to link the metadata artifact to the content
                skipped_count += 1

    print(f"Created {created_count} missing metadata artifacts. Skipped {skipped_count} packages.")
202+
203+
204+
class Migration(migrations.Migration):
    # Data migration: backfill ".metadata" artifacts for packages that record
    # a metadata_sha256 but never had the metadata file stored as an artifact.

    dependencies = [
        ("python", "0018_packageprovenance"),
    ]

    operations = [
        migrations.RunPython(
            create_missing_metadata_artifacts,
            # Reversal is a no-op: the created artifacts are intentionally
            # left in place when migrating backwards.
            reverse_code=migrations.RunPython.noop,
        ),
    ]

pulp_python/app/serializers.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import logging
22
import os
3+
import tempfile
34
from gettext import gettext as _
45
from django.conf import settings
56
from django.db.utils import IntegrityError
@@ -22,6 +23,7 @@
2223
)
2324
from pulp_python.app.utils import (
2425
DIST_EXTENSIONS,
26+
artifact_to_metadata_artifact,
2527
artifact_to_python_content_data,
2628
get_project_metadata_from_file,
2729
parse_project_metadata,
@@ -93,11 +95,31 @@ class Meta:
9395
model = python_models.PythonDistribution
9496

9597

98+
class PythonSingleContentArtifactField(core_serializers.SingleContentArtifactField):
    """
    Custom field with overridden get_attribute method. Meant to be used only in
    PythonPackageContentSerializer to handle possible existence of metadata artifact.
    """

    def get_attribute(self, instance):
        # When content has multiple artifacts (wheel + metadata), return the main
        # one: the first ContentArtifact whose relative_path does not end in
        # ".metadata". Otherwise defer to the parent single-artifact behavior.
        # NOTE(review): _artifacts.count() issues a query unless prefetched --
        # confirm callers prefetch, otherwise this adds one query per package.
        if instance._artifacts.count() > 1:
            for ca in instance.contentartifact_set.all():
                if not ca.relative_path.endswith(".metadata"):
                    return ca.artifact

        return super().get_attribute(instance)
112+
113+
96114
class PythonPackageContentSerializer(core_serializers.SingleArtifactContentUploadSerializer):
97115
"""
98116
A Serializer for PythonPackageContent.
99117
"""
100118

119+
artifact = PythonSingleContentArtifactField(
120+
help_text=_("Artifact file representing the physical content"),
121+
)
122+
101123
# Core metadata
102124
# Version 1.0
103125
author = serializers.CharField(
@@ -386,8 +408,21 @@ def deferred_validate(self, data):
386408
if attestations := data.pop("attestations", None):
387409
data["provenance"] = self.handle_attestations(filename, data["sha256"], attestations)
388410

411+
# Create metadata artifact for wheel files
412+
if filename.endswith(".whl"):
413+
if metadata_artifact := artifact_to_metadata_artifact(filename, artifact):
414+
data["metadata_artifact"] = metadata_artifact
415+
data["metadata_sha256"] = metadata_artifact.sha256
416+
389417
return data
390418

419+
def get_artifacts(self, validated_data):
420+
artifacts = super().get_artifacts(validated_data)
421+
if metadata_artifact := validated_data.pop("metadata_artifact", None):
422+
relative_path = f"{validated_data['filename']}.metadata"
423+
artifacts[relative_path] = metadata_artifact
424+
return artifacts
425+
391426
def retrieve(self, validated_data):
392427
content = python_models.PythonPackageContent.objects.filter(
393428
sha256=validated_data["sha256"], _pulp_domain=get_domain()
@@ -419,6 +454,7 @@ def create(self, validated_data):
419454

420455
class Meta:
421456
fields = core_serializers.SingleArtifactContentUploadSerializer.Meta.fields + (
457+
"artifact",
422458
"author",
423459
"author_email",
424460
"description",
@@ -514,6 +550,15 @@ def validate(self, data):
514550
data["provenance"] = self.handle_attestations(
515551
filename, data["sha256"], attestations, offline=True
516552
)
553+
# Create metadata artifact for wheel files
554+
if filename.endswith(".whl"):
555+
with tempfile.TemporaryDirectory(dir=settings.WORKING_DIRECTORY) as temp_dir:
556+
if metadata_artifact := artifact_to_metadata_artifact(
557+
filename, artifact, tmp_dir=temp_dir
558+
):
559+
data["metadata_artifact"] = metadata_artifact
560+
data["metadata_sha256"] = metadata_artifact.sha256
561+
517562
return data
518563

519564
class Meta(PythonPackageContentSerializer.Meta):

pulp_python/app/tasks/repair.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,13 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
9595
progress_report.save()
9696
with progress_report:
9797
for package in progress_report.iter(immediate_content.iterator(chunk_size=BULK_SIZE)):
98-
new_data = artifact_to_python_content_data(
99-
package.filename, package._artifacts.get(), domain
98+
# Get the main artifact
99+
main_artifact = (
100+
package.contentartifact_set.exclude(relative_path__endswith=".metadata")
101+
.first()
102+
.artifact
100103
)
104+
new_data = artifact_to_python_content_data(package.filename, main_artifact, domain)
101105
total_repaired += update_package_if_needed(
102106
package, new_data, batch, set_of_update_fields
103107
)
@@ -113,7 +117,11 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
113117
grouped_by_url = defaultdict(list)
114118

115119
for package in group_set:
116-
for ra in package.contentartifact_set.get().remoteartifact_set.all():
120+
for ra in (
121+
package.contentartifact_set.exclude(relative_path__endswith=".metadata")
122+
.first()
123+
.remoteartifact_set.all()
124+
):
117125
grouped_by_url[ra.remote.url].append((package, ra))
118126

119127
# Prioritize the URL that can serve the most packages

0 commit comments

Comments
 (0)