From 73d9e230fca4c3eb4e0e3f8b5ad65cccb7ecc6c1 Mon Sep 17 00:00:00 2001 From: Harshit Jain <48647625+Harshit28j@users.noreply.github.com> Date: Wed, 24 Dec 2025 01:55:05 +0530 Subject: [PATCH 1/2] fix: skip uploading unchanged files in exporter, compare CRC32C hashes before upload --- gcp/workers/exporter/exporter.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/gcp/workers/exporter/exporter.py b/gcp/workers/exporter/exporter.py index 60b083295f9..2f21247f473 100755 --- a/gcp/workers/exporter/exporter.py +++ b/gcp/workers/exporter/exporter.py @@ -14,6 +14,7 @@ # limitations under the License. """OSV Exporter.""" import argparse +import base64 import concurrent.futures import csv import json @@ -22,6 +23,8 @@ import zipfile from typing import List +import google_crc32c + from google.cloud import ndb from google.cloud import storage from google.cloud.storage import retry @@ -211,10 +214,30 @@ def _export_to_file_and_zipfile(bug: osv.Bug): f'{ecosystem}/{filename}') +def _calculate_crc32c(file_path: str) -> str: + """Calculate CRC32C hash of a file and return as base64 string.""" + crc = 0 + with open(file_path, 'rb') as f: + while chunk := f.read(8192): + crc = google_crc32c.extend(crc, chunk) + return base64.b64encode(crc.to_bytes(4, byteorder='big')).decode('utf-8') + + def upload_single(bucket: Bucket, source_path: str, target_path: str): - """Upload a single file to a GCS bucket.""" - logging.info('Uploading %s', target_path) + """Upload a single file to a GCS bucket if content has changed. + + Compares CRC32C hashes to avoid uploading unchanged files. + See https://github.com/google/osv.dev/issues/3513 + """ try: + existing_blob = bucket.get_blob(target_path) + if existing_blob and existing_blob.crc32c: + local_crc = _calculate_crc32c(source_path) + if local_crc == existing_blob.crc32c: + logging.debug('Skipping %s (unchanged)', target_path) + return + + logging.info('Uploading %s', target_path) blob = bucket.blob(target_path) blob.upload_from_filename(source_path, retry=retry.DEFAULT_RETRY) except Exception as e: From e07be325c0c0cd8ea0a1e489212428ef8fcbcab3 Mon Sep 17 00:00:00 2001 From: Harshit Jain Date: Mon, 29 Dec 2025 09:30:25 +0530 Subject: [PATCH 2/2] chore(deps): add google-crc32c in pyproject.toml --- gcp/workers/worker/poetry.lock | 2 +- gcp/workers/worker/pyproject.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/gcp/workers/worker/poetry.lock b/gcp/workers/worker/poetry.lock index f9fd67b9435..12b11395ab1 100644 --- a/gcp/workers/worker/poetry.lock +++ b/gcp/workers/worker/poetry.lock @@ -2013,4 +2013,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.13,<3.14" -content-hash = "d415ffbaaa97d13ba0e438119adbe5de03219d01874ca4ec50ad7824a4f738be" +content-hash = "f8c9f6a79eaf74d56909f471f5ed008434578049c6eef16257214e87082616e9" diff --git a/gcp/workers/worker/pyproject.toml b/gcp/workers/worker/pyproject.toml index 2c3af9f4694..042d8b97f6c 100644 --- a/gcp/workers/worker/pyproject.toml +++ b/gcp/workers/worker/pyproject.toml @@ -7,6 +7,7 @@ dependencies = [ "google-cloud-pubsub==2.34.0", "google-cloud-ndb==2.4.0", "google-cloud-storage==2.19.0", + "google-crc32c>=1.0.0", "pyyaml==6.0.3", "redis==5.3.1", "packageurl-python==0.17.6",