From 187bd3ac36c3ceef6d90b2e0011769e9f024e4c5 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 28 Apr 2026 18:24:34 +0000 Subject: [PATCH 01/59] Update generated protobuf artifacts --- .../research/gbml/gigl_resource_config.proto | 4 + .../DistributedInferencerConfig.scala | 4 +- .../GiglResourceConfigProto.scala | 140 +++++++++--------- .../VertexAiResourceConfig.scala | 41 ++++- .../DistributedInferencerConfig.scala | 4 +- .../GiglResourceConfigProto.scala | 140 +++++++++--------- .../VertexAiResourceConfig.scala | 41 ++++- .../research/gbml/gigl_resource_config_pb2.py | 48 +++--- .../gbml/gigl_resource_config_pb2.pyi | 8 +- 9 files changed, 255 insertions(+), 175 deletions(-) diff --git a/proto/snapchat/research/gbml/gigl_resource_config.proto b/proto/snapchat/research/gbml/gigl_resource_config.proto index 0d930949b..f7e30bc8f 100644 --- a/proto/snapchat/research/gbml/gigl_resource_config.proto +++ b/proto/snapchat/research/gbml/gigl_resource_config.proto @@ -130,6 +130,10 @@ message VertexAiResourceConfig { // Compute Engine reservation affinity for the job. // See https://docs.cloud.google.com/vertex-ai/docs/training/use-reservations VertexAiReservationAffinity reservation_affinity = 9; + + // Existing Vertex AI TensorBoard resource to attach to the job. 
+ // Format: projects/{project}/locations/{region}/tensorboards/{tensorboard_id} + string tensorboard_resource_name = 10; } // Configuration for KFP job resources diff --git a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/DistributedInferencerConfig.scala b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/DistributedInferencerConfig.scala index 8363bdb1f..2198a2eb5 100644 --- a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/DistributedInferencerConfig.scala +++ b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/DistributedInferencerConfig.scala @@ -38,7 +38,7 @@ final case class DistributedInferencerConfig( __serializedSizeMemoized = __size } __size - 1 - + } def writeTo(`_output__`: _root_.com.google.protobuf.CodedOutputStream): _root_.scala.Unit = { trainerConfig.vertexAiInferencerConfig.foreach { __v => @@ -165,7 +165,7 @@ object DistributedInferencerConfig extends scalapb.GeneratedMessageCompanion[sna override def number: _root_.scala.Int = 0 override def value: _root_.scala.Nothing = throw new java.util.NoSuchElementException("Empty.value") } - + @SerialVersionUID(0L) final case class VertexAiInferencerConfig(value: snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig) extends snapchat.research.gbml.gigl_resource_config.DistributedInferencerConfig.TrainerConfig { type ValueType = snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig diff --git a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala index a086f6113..94ffd417b 100644 --- a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala +++ b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala @@ -48,7 +48,7 @@ object 
GiglResourceConfigProto extends _root_.scalapb.GeneratedFileObject { XQSMwoMbnVtX3JlcGxpY2FzGAUgASgNQhDiPw0SC251bVJlcGxpY2FzUgtudW1SZXBsaWNhcyJGChJMb2NhbFRyYWluZXJDb25ma WcSMAoLbnVtX3dvcmtlcnMYASABKA1CD+I/DBIKbnVtV29ya2Vyc1IKbnVtV29ya2VycyKZAQobVmVydGV4QWlSZXNlcnZhdGlvb kFmZmluaXR5Eh0KBHR5cGUYASABKAlCCeI/BhIEdHlwZVIEdHlwZRJbChpyZXNlcnZhdGlvbl9yZXNvdXJjZV9uYW1lcxgCIAMoC - UId4j8aEhhyZXNlcnZhdGlvblJlc291cmNlTmFtZXNSGHJlc2VydmF0aW9uUmVzb3VyY2VOYW1lcyLUBAoWVmVydGV4QWlSZXNvd + UId4j8aEhhyZXNlcnZhdGlvblJlc291cmNlTmFtZXNSGHJlc2VydmF0aW9uUmVzb3VyY2VOYW1lcyKuBQoWVmVydGV4QWlSZXNvd XJjZUNvbmZpZxIzCgxtYWNoaW5lX3R5cGUYASABKAlCEOI/DRILbWFjaGluZVR5cGVSC21hY2hpbmVUeXBlEicKCGdwdV90eXBlG AIgASgJQgziPwkSB2dwdVR5cGVSB2dwdVR5cGUSKgoJZ3B1X2xpbWl0GAMgASgNQg3iPwoSCGdwdUxpbWl0UghncHVMaW1pdBIzC gxudW1fcmVwbGljYXMYBCABKA1CEOI/DRILbnVtUmVwbGljYXNSC251bVJlcGxpY2FzEiYKB3RpbWVvdXQYBSABKA1CDOI/CRIHd @@ -56,74 +56,76 @@ object GiglResourceConfigProto extends _root_.scalapb.GeneratedFileObject { Wdpb25PdmVycmlkZRJIChNzY2hlZHVsaW5nX3N0cmF0ZWd5GAcgASgJQhfiPxQSEnNjaGVkdWxpbmdTdHJhdGVneVISc2NoZWR1b GluZ1N0cmF0ZWd5Ej4KEWJvb3RfZGlza19zaXplX2diGAggASgNQhPiPxASDmJvb3REaXNrU2l6ZUdiUg5ib290RGlza1NpemVHY hKAAQoUcmVzZXJ2YXRpb25fYWZmaW5pdHkYCSABKAsyMy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpUmVzZXJ2YXRpb - 25BZmZpbml0eUIY4j8VEhNyZXNlcnZhdGlvbkFmZmluaXR5UhNyZXNlcnZhdGlvbkFmZmluaXR5IooCChFLRlBSZXNvdXJjZUNvb - mZpZxIwCgtjcHVfcmVxdWVzdBgBIAEoCUIP4j8MEgpjcHVSZXF1ZXN0UgpjcHVSZXF1ZXN0EjkKDm1lbW9yeV9yZXF1ZXN0GAIgA - SgJQhLiPw8SDW1lbW9yeVJlcXVlc3RSDW1lbW9yeVJlcXVlc3QSJwoIZ3B1X3R5cGUYAyABKAlCDOI/CRIHZ3B1VHlwZVIHZ3B1V - HlwZRIqCglncHVfbGltaXQYBCABKA1CDeI/ChIIZ3B1TGltaXRSCGdwdUxpbWl0EjMKDG51bV9yZXBsaWNhcxgFIAEoDUIQ4j8NE - gtudW1SZXBsaWNhc1ILbnVtUmVwbGljYXMiRwoTTG9jYWxSZXNvdXJjZUNvbmZpZxIwCgtudW1fd29ya2VycxgBIAEoDUIP4j8ME - gpudW1Xb3JrZXJzUgpudW1Xb3JrZXJzItkCChhWZXJ0ZXhBaUdyYXBoU3RvcmVDb25maWcSbQoQZ3JhcGhfc3RvcmVfcG9vbBgBI - 
AEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZXNvdXJjZUNvbmZpZ0IT4j8QEg5ncmFwaFN0b3JlUG9vbFIOZ - 3JhcGhTdG9yZVBvb2wSYwoMY29tcHV0ZV9wb29sGAIgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaVJlc291c - mNlQ29uZmlnQhDiPw0SC2NvbXB1dGVQb29sUgtjb21wdXRlUG9vbBJpCiBjb21wdXRlX2NsdXN0ZXJfbG9jYWxfd29ybGRfc2l6Z - RgDIAEoBUIh4j8eEhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTaXplUhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTaXplIp0DC - hhEaXN0cmlidXRlZFRyYWluZXJDb25maWcShAEKGHZlcnRleF9haV90cmFpbmVyX2NvbmZpZxgBIAEoCzItLnNuYXBjaGF0LnJlc - 2VhcmNoLmdibWwuVmVydGV4QWlUcmFpbmVyQ29uZmlnQhriPxcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyY - WluZXJDb25maWcSbwoSa2ZwX3RyYWluZXJfY29uZmlnGAIgASgLMiguc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBUcmFpbmVyQ - 29uZmlnQhXiPxISEGtmcFRyYWluZXJDb25maWdIAFIQa2ZwVHJhaW5lckNvbmZpZxJ3ChRsb2NhbF90cmFpbmVyX2NvbmZpZxgDI - AEoCzIqLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxUcmFpbmVyQ29uZmlnQhfiPxQSEmxvY2FsVHJhaW5lckNvbmZpZ0gAU - hJsb2NhbFRyYWluZXJDb25maWdCEAoOdHJhaW5lcl9jb25maWcixwQKFVRyYWluZXJSZXNvdXJjZUNvbmZpZxKFAQoYdmVydGV4X - 2FpX3RyYWluZXJfY29uZmlnGAEgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaVJlc291cmNlQ29uZmlnQhriP - xcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyYWluZXJDb25maWcScAoSa2ZwX3RyYWluZXJfY29uZmlnGAIgA - SgLMikuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBSZXNvdXJjZUNvbmZpZ0IV4j8SEhBrZnBUcmFpbmVyQ29uZmlnSABSEGtmc - FRyYWluZXJDb25maWcSeAoUbG9jYWxfdHJhaW5lcl9jb25maWcYAyABKAsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkxvY2FsU - mVzb3VyY2VDb25maWdCF+I/FBISbG9jYWxUcmFpbmVyQ29uZmlnSABSEmxvY2FsVHJhaW5lckNvbmZpZxKnAQokdmVydGV4X2FpX - 2dyYXBoX3N0b3JlX3RyYWluZXJfY29uZmlnGAQgASgLMjAuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaUdyYXBoU3Rvc - mVDb25maWdCJOI/IRIfdmVydGV4QWlHcmFwaFN0b3JlVHJhaW5lckNvbmZpZ0gAUh92ZXJ0ZXhBaUdyYXBoU3RvcmVUcmFpbmVyQ - 29uZmlnQhAKDnRyYWluZXJfY29uZmlnIocFChhJbmZlcmVuY2VyUmVzb3VyY2VDb25maWcSjgEKG3ZlcnRleF9haV9pbmZlcmVuY - 2VyX2NvbmZpZxgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZXNvdXJjZUNvbmZpZ0Id4j8aEhh2ZXJ0Z - 
XhBaUluZmVyZW5jZXJDb25maWdIAFIYdmVydGV4QWlJbmZlcmVuY2VyQ29uZmlnEo0BChpkYXRhZmxvd19pbmZlcmVuY2VyX2Nvb - mZpZxgCIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuRGF0YWZsb3dSZXNvdXJjZUNvbmZpZ0Id4j8aEhhkYXRhZmxvd0luZ - mVyZW5jZXJDb25maWdIAFIYZGF0YWZsb3dJbmZlcmVuY2VyQ29uZmlnEoEBChdsb2NhbF9pbmZlcmVuY2VyX2NvbmZpZxgDIAEoC - zIrLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxSZXNvdXJjZUNvbmZpZ0Ia4j8XEhVsb2NhbEluZmVyZW5jZXJDb25maWdIA - FIVbG9jYWxJbmZlcmVuY2VyQ29uZmlnErABCid2ZXJ0ZXhfYWlfZ3JhcGhfc3RvcmVfaW5mZXJlbmNlcl9jb25maWcYBCABKAsyM - C5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpR3JhcGhTdG9yZUNvbmZpZ0In4j8kEiJ2ZXJ0ZXhBaUdyYXBoU3RvcmVJb - mZlcmVuY2VyQ29uZmlnSABSInZlcnRleEFpR3JhcGhTdG9yZUluZmVyZW5jZXJDb25maWdCEwoRaW5mZXJlbmNlcl9jb25maWcil - wgKFFNoYXJlZFJlc291cmNlQ29uZmlnEn4KD3Jlc291cmNlX2xhYmVscxgBIAMoCzJALnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuU - 2hhcmVkUmVzb3VyY2VDb25maWcuUmVzb3VyY2VMYWJlbHNFbnRyeUIT4j8QEg5yZXNvdXJjZUxhYmVsc1IOcmVzb3VyY2VMYWJlb - HMSjgEKFWNvbW1vbl9jb21wdXRlX2NvbmZpZxgCIAEoCzJALnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuU2hhcmVkUmVzb3VyY2VDb - 25maWcuQ29tbW9uQ29tcHV0ZUNvbmZpZ0IY4j8VEhNjb21tb25Db21wdXRlQ29uZmlnUhNjb21tb25Db21wdXRlQ29uZmlnGpQFC - hNDb21tb25Db21wdXRlQ29uZmlnEiYKB3Byb2plY3QYASABKAlCDOI/CRIHcHJvamVjdFIHcHJvamVjdBIjCgZyZWdpb24YAiABK - AlCC+I/CBIGcmVnaW9uUgZyZWdpb24SQwoSdGVtcF9hc3NldHNfYnVja2V0GAMgASgJQhXiPxISEHRlbXBBc3NldHNCdWNrZXRSE - HRlbXBBc3NldHNCdWNrZXQSXAobdGVtcF9yZWdpb25hbF9hc3NldHNfYnVja2V0GAQgASgJQh3iPxoSGHRlbXBSZWdpb25hbEFzc - 2V0c0J1Y2tldFIYdGVtcFJlZ2lvbmFsQXNzZXRzQnVja2V0EkMKEnBlcm1fYXNzZXRzX2J1Y2tldBgFIAEoCUIV4j8SEhBwZXJtQ - XNzZXRzQnVja2V0UhBwZXJtQXNzZXRzQnVja2V0EloKG3RlbXBfYXNzZXRzX2JxX2RhdGFzZXRfbmFtZRgGIAEoCUIc4j8ZEhd0Z - W1wQXNzZXRzQnFEYXRhc2V0TmFtZVIXdGVtcEFzc2V0c0JxRGF0YXNldE5hbWUSVgoZZW1iZWRkaW5nX2JxX2RhdGFzZXRfbmFtZ - RgHIAEoCUIb4j8YEhZlbWJlZGRpbmdCcURhdGFzZXROYW1lUhZlbWJlZGRpbmdCcURhdGFzZXROYW1lElYKGWdjcF9zZXJ2aWNlX - 2FjY291bnRfZW1haWwYCCABKAlCG+I/GBIWZ2NwU2VydmljZUFjY291bnRFbWFpbFIWZ2NwU2VydmljZUFjY291bnRFbWFpbBI8C - 
g9kYXRhZmxvd19ydW5uZXIYCyABKAlCE+I/EBIOZGF0YWZsb3dSdW5uZXJSDmRhdGFmbG93UnVubmVyGlcKE1Jlc291cmNlTGFiZ - WxzRW50cnkSGgoDa2V5GAEgASgJQgjiPwUSA2tleVIDa2V5EiAKBXZhbHVlGAIgASgJQgriPwcSBXZhbHVlUgV2YWx1ZToCOAEi9 - wgKEkdpZ2xSZXNvdXJjZUNvbmZpZxJbChpzaGFyZWRfcmVzb3VyY2VfY29uZmlnX3VyaRgBIAEoCUIc4j8ZEhdzaGFyZWRSZXNvd - XJjZUNvbmZpZ1VyaUgAUhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1VyaRJ/ChZzaGFyZWRfcmVzb3VyY2VfY29uZmlnGAIgASgLMiwuc - 25hcGNoYXQucmVzZWFyY2guZ2JtbC5TaGFyZWRSZXNvdXJjZUNvbmZpZ0IZ4j8WEhRzaGFyZWRSZXNvdXJjZUNvbmZpZ0gAUhRza - GFyZWRSZXNvdXJjZUNvbmZpZxJ4ChNwcmVwcm9jZXNzb3JfY29uZmlnGAwgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EY - XRhUHJlcHJvY2Vzc29yQ29uZmlnQhfiPxQSEnByZXByb2Nlc3NvckNvbmZpZ1IScHJlcHJvY2Vzc29yQ29uZmlnEn8KF3N1YmdyY - XBoX3NhbXBsZXJfY29uZmlnGA0gASgLMisuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5TcGFya1Jlc291cmNlQ29uZmlnQhriPxcSF - XN1YmdyYXBoU2FtcGxlckNvbmZpZ1IVc3ViZ3JhcGhTYW1wbGVyQ29uZmlnEnwKFnNwbGl0X2dlbmVyYXRvcl9jb25maWcYDiABK - AsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlNwYXJrUmVzb3VyY2VDb25maWdCGeI/FhIUc3BsaXRHZW5lcmF0b3JDb25maWdSF - HNwbGl0R2VuZXJhdG9yQ29uZmlnEm0KDnRyYWluZXJfY29uZmlnGA8gASgLMjAuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EaXN0c - mlidXRlZFRyYWluZXJDb25maWdCFBgB4j8PEg10cmFpbmVyQ29uZmlnUg10cmFpbmVyQ29uZmlnEnQKEWluZmVyZW5jZXJfY29uZ - mlnGBAgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EYXRhZmxvd1Jlc291cmNlQ29uZmlnQhcYAeI/EhIQaW5mZXJlbmNlc - kNvbmZpZ1IQaW5mZXJlbmNlckNvbmZpZxKBAQoXdHJhaW5lcl9yZXNvdXJjZV9jb25maWcYESABKAsyLS5zbmFwY2hhdC5yZXNlY - XJjaC5nYm1sLlRyYWluZXJSZXNvdXJjZUNvbmZpZ0Ia4j8XEhV0cmFpbmVyUmVzb3VyY2VDb25maWdSFXRyYWluZXJSZXNvdXJjZ - UNvbmZpZxKNAQoaaW5mZXJlbmNlcl9yZXNvdXJjZV9jb25maWcYEiABKAsyMC5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkluZmVyZ - W5jZXJSZXNvdXJjZUNvbmZpZ0Id4j8aEhhpbmZlcmVuY2VyUmVzb3VyY2VDb25maWdSGGluZmVyZW5jZXJSZXNvdXJjZUNvbmZpZ - 0IRCg9zaGFyZWRfcmVzb3VyY2Uq4wMKCUNvbXBvbmVudBItChFDb21wb25lbnRfVW5rbm93bhAAGhbiPxMSEUNvbXBvbmVudF9Vb - mtub3duEj8KGkNvbXBvbmVudF9Db25maWdfVmFsaWRhdG9yEAEaH+I/HBIaQ29tcG9uZW50X0NvbmZpZ19WYWxpZGF0b3ISPwoaQ - 
29tcG9uZW50X0NvbmZpZ19Qb3B1bGF0b3IQAhof4j8cEhpDb21wb25lbnRfQ29uZmlnX1BvcHVsYXRvchJBChtDb21wb25lbnRfR - GF0YV9QcmVwcm9jZXNzb3IQAxog4j8dEhtDb21wb25lbnRfRGF0YV9QcmVwcm9jZXNzb3ISPwoaQ29tcG9uZW50X1N1YmdyYXBoX - 1NhbXBsZXIQBBof4j8cEhpDb21wb25lbnRfU3ViZ3JhcGhfU2FtcGxlchI9ChlDb21wb25lbnRfU3BsaXRfR2VuZXJhdG9yEAUaH - uI/GxIZQ29tcG9uZW50X1NwbGl0X0dlbmVyYXRvchItChFDb21wb25lbnRfVHJhaW5lchAGGhbiPxMSEUNvbXBvbmVudF9UcmFpb - mVyEjMKFENvbXBvbmVudF9JbmZlcmVuY2VyEAcaGeI/FhIUQ29tcG9uZW50X0luZmVyZW5jZXJiBnByb3RvMw==""" + 25BZmZpbml0eUIY4j8VEhNyZXNlcnZhdGlvbkFmZmluaXR5UhNyZXNlcnZhdGlvbkFmZmluaXR5ElgKGXRlbnNvcmJvYXJkX3Jlc + 291cmNlX25hbWUYCiABKAlCHOI/GRIXdGVuc29yYm9hcmRSZXNvdXJjZU5hbWVSF3RlbnNvcmJvYXJkUmVzb3VyY2VOYW1lIooCC + hFLRlBSZXNvdXJjZUNvbmZpZxIwCgtjcHVfcmVxdWVzdBgBIAEoCUIP4j8MEgpjcHVSZXF1ZXN0UgpjcHVSZXF1ZXN0EjkKDm1lb + W9yeV9yZXF1ZXN0GAIgASgJQhLiPw8SDW1lbW9yeVJlcXVlc3RSDW1lbW9yeVJlcXVlc3QSJwoIZ3B1X3R5cGUYAyABKAlCDOI/C + RIHZ3B1VHlwZVIHZ3B1VHlwZRIqCglncHVfbGltaXQYBCABKA1CDeI/ChIIZ3B1TGltaXRSCGdwdUxpbWl0EjMKDG51bV9yZXBsa + WNhcxgFIAEoDUIQ4j8NEgtudW1SZXBsaWNhc1ILbnVtUmVwbGljYXMiRwoTTG9jYWxSZXNvdXJjZUNvbmZpZxIwCgtudW1fd29ya + 2VycxgBIAEoDUIP4j8MEgpudW1Xb3JrZXJzUgpudW1Xb3JrZXJzItkCChhWZXJ0ZXhBaUdyYXBoU3RvcmVDb25maWcSbQoQZ3Jhc + Ghfc3RvcmVfcG9vbBgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZXNvdXJjZUNvbmZpZ0IT4j8QEg5nc + mFwaFN0b3JlUG9vbFIOZ3JhcGhTdG9yZVBvb2wSYwoMY29tcHV0ZV9wb29sGAIgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2Jtb + C5WZXJ0ZXhBaVJlc291cmNlQ29uZmlnQhDiPw0SC2NvbXB1dGVQb29sUgtjb21wdXRlUG9vbBJpCiBjb21wdXRlX2NsdXN0ZXJfb + G9jYWxfd29ybGRfc2l6ZRgDIAEoBUIh4j8eEhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTaXplUhxjb21wdXRlQ2x1c3RlckxvY + 2FsV29ybGRTaXplIp0DChhEaXN0cmlidXRlZFRyYWluZXJDb25maWcShAEKGHZlcnRleF9haV90cmFpbmVyX2NvbmZpZxgBIAEoC + zItLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlUcmFpbmVyQ29uZmlnQhriPxcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ + 0gAUhV2ZXJ0ZXhBaVRyYWluZXJDb25maWcSbwoSa2ZwX3RyYWluZXJfY29uZmlnGAIgASgLMiguc25hcGNoYXQucmVzZWFyY2guZ + 
2JtbC5LRlBUcmFpbmVyQ29uZmlnQhXiPxISEGtmcFRyYWluZXJDb25maWdIAFIQa2ZwVHJhaW5lckNvbmZpZxJ3ChRsb2NhbF90c + mFpbmVyX2NvbmZpZxgDIAEoCzIqLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxUcmFpbmVyQ29uZmlnQhfiPxQSEmxvY2FsV + HJhaW5lckNvbmZpZ0gAUhJsb2NhbFRyYWluZXJDb25maWdCEAoOdHJhaW5lcl9jb25maWcixwQKFVRyYWluZXJSZXNvdXJjZUNvb + mZpZxKFAQoYdmVydGV4X2FpX3RyYWluZXJfY29uZmlnGAEgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaVJlc + 291cmNlQ29uZmlnQhriPxcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyYWluZXJDb25maWcScAoSa2ZwX3RyY + WluZXJfY29uZmlnGAIgASgLMikuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBSZXNvdXJjZUNvbmZpZ0IV4j8SEhBrZnBUcmFpb + mVyQ29uZmlnSABSEGtmcFRyYWluZXJDb25maWcSeAoUbG9jYWxfdHJhaW5lcl9jb25maWcYAyABKAsyKy5zbmFwY2hhdC5yZXNlY + XJjaC5nYm1sLkxvY2FsUmVzb3VyY2VDb25maWdCF+I/FBISbG9jYWxUcmFpbmVyQ29uZmlnSABSEmxvY2FsVHJhaW5lckNvbmZpZ + xKnAQokdmVydGV4X2FpX2dyYXBoX3N0b3JlX3RyYWluZXJfY29uZmlnGAQgASgLMjAuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZ + XJ0ZXhBaUdyYXBoU3RvcmVDb25maWdCJOI/IRIfdmVydGV4QWlHcmFwaFN0b3JlVHJhaW5lckNvbmZpZ0gAUh92ZXJ0ZXhBaUdyY + XBoU3RvcmVUcmFpbmVyQ29uZmlnQhAKDnRyYWluZXJfY29uZmlnIocFChhJbmZlcmVuY2VyUmVzb3VyY2VDb25maWcSjgEKG3Zlc + nRleF9haV9pbmZlcmVuY2VyX2NvbmZpZxgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZXNvdXJjZUNvb + mZpZ0Id4j8aEhh2ZXJ0ZXhBaUluZmVyZW5jZXJDb25maWdIAFIYdmVydGV4QWlJbmZlcmVuY2VyQ29uZmlnEo0BChpkYXRhZmxvd + 19pbmZlcmVuY2VyX2NvbmZpZxgCIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuRGF0YWZsb3dSZXNvdXJjZUNvbmZpZ0Id4 + j8aEhhkYXRhZmxvd0luZmVyZW5jZXJDb25maWdIAFIYZGF0YWZsb3dJbmZlcmVuY2VyQ29uZmlnEoEBChdsb2NhbF9pbmZlcmVuY + 2VyX2NvbmZpZxgDIAEoCzIrLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxSZXNvdXJjZUNvbmZpZ0Ia4j8XEhVsb2NhbEluZ + mVyZW5jZXJDb25maWdIAFIVbG9jYWxJbmZlcmVuY2VyQ29uZmlnErABCid2ZXJ0ZXhfYWlfZ3JhcGhfc3RvcmVfaW5mZXJlbmNlc + l9jb25maWcYBCABKAsyMC5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpR3JhcGhTdG9yZUNvbmZpZ0In4j8kEiJ2ZXJ0Z + XhBaUdyYXBoU3RvcmVJbmZlcmVuY2VyQ29uZmlnSABSInZlcnRleEFpR3JhcGhTdG9yZUluZmVyZW5jZXJDb25maWdCEwoRaW5mZ + 
XJlbmNlcl9jb25maWcilwgKFFNoYXJlZFJlc291cmNlQ29uZmlnEn4KD3Jlc291cmNlX2xhYmVscxgBIAMoCzJALnNuYXBjaGF0L + nJlc2VhcmNoLmdibWwuU2hhcmVkUmVzb3VyY2VDb25maWcuUmVzb3VyY2VMYWJlbHNFbnRyeUIT4j8QEg5yZXNvdXJjZUxhYmVsc + 1IOcmVzb3VyY2VMYWJlbHMSjgEKFWNvbW1vbl9jb21wdXRlX2NvbmZpZxgCIAEoCzJALnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuU + 2hhcmVkUmVzb3VyY2VDb25maWcuQ29tbW9uQ29tcHV0ZUNvbmZpZ0IY4j8VEhNjb21tb25Db21wdXRlQ29uZmlnUhNjb21tb25Db + 21wdXRlQ29uZmlnGpQFChNDb21tb25Db21wdXRlQ29uZmlnEiYKB3Byb2plY3QYASABKAlCDOI/CRIHcHJvamVjdFIHcHJvamVjd + BIjCgZyZWdpb24YAiABKAlCC+I/CBIGcmVnaW9uUgZyZWdpb24SQwoSdGVtcF9hc3NldHNfYnVja2V0GAMgASgJQhXiPxISEHRlb + XBBc3NldHNCdWNrZXRSEHRlbXBBc3NldHNCdWNrZXQSXAobdGVtcF9yZWdpb25hbF9hc3NldHNfYnVja2V0GAQgASgJQh3iPxoSG + HRlbXBSZWdpb25hbEFzc2V0c0J1Y2tldFIYdGVtcFJlZ2lvbmFsQXNzZXRzQnVja2V0EkMKEnBlcm1fYXNzZXRzX2J1Y2tldBgFI + AEoCUIV4j8SEhBwZXJtQXNzZXRzQnVja2V0UhBwZXJtQXNzZXRzQnVja2V0EloKG3RlbXBfYXNzZXRzX2JxX2RhdGFzZXRfbmFtZ + RgGIAEoCUIc4j8ZEhd0ZW1wQXNzZXRzQnFEYXRhc2V0TmFtZVIXdGVtcEFzc2V0c0JxRGF0YXNldE5hbWUSVgoZZW1iZWRkaW5nX + 2JxX2RhdGFzZXRfbmFtZRgHIAEoCUIb4j8YEhZlbWJlZGRpbmdCcURhdGFzZXROYW1lUhZlbWJlZGRpbmdCcURhdGFzZXROYW1lE + lYKGWdjcF9zZXJ2aWNlX2FjY291bnRfZW1haWwYCCABKAlCG+I/GBIWZ2NwU2VydmljZUFjY291bnRFbWFpbFIWZ2NwU2VydmljZ + UFjY291bnRFbWFpbBI8Cg9kYXRhZmxvd19ydW5uZXIYCyABKAlCE+I/EBIOZGF0YWZsb3dSdW5uZXJSDmRhdGFmbG93UnVubmVyG + lcKE1Jlc291cmNlTGFiZWxzRW50cnkSGgoDa2V5GAEgASgJQgjiPwUSA2tleVIDa2V5EiAKBXZhbHVlGAIgASgJQgriPwcSBXZhb + HVlUgV2YWx1ZToCOAEi9wgKEkdpZ2xSZXNvdXJjZUNvbmZpZxJbChpzaGFyZWRfcmVzb3VyY2VfY29uZmlnX3VyaRgBIAEoCUIc4 + j8ZEhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1VyaUgAUhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1VyaRJ/ChZzaGFyZWRfcmVzb3VyY2VfY + 29uZmlnGAIgASgLMiwuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5TaGFyZWRSZXNvdXJjZUNvbmZpZ0IZ4j8WEhRzaGFyZWRSZXNvd + XJjZUNvbmZpZ0gAUhRzaGFyZWRSZXNvdXJjZUNvbmZpZxJ4ChNwcmVwcm9jZXNzb3JfY29uZmlnGAwgASgLMi4uc25hcGNoYXQuc + mVzZWFyY2guZ2JtbC5EYXRhUHJlcHJvY2Vzc29yQ29uZmlnQhfiPxQSEnByZXByb2Nlc3NvckNvbmZpZ1IScHJlcHJvY2Vzc29yQ + 
29uZmlnEn8KF3N1YmdyYXBoX3NhbXBsZXJfY29uZmlnGA0gASgLMisuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5TcGFya1Jlc291c + mNlQ29uZmlnQhriPxcSFXN1YmdyYXBoU2FtcGxlckNvbmZpZ1IVc3ViZ3JhcGhTYW1wbGVyQ29uZmlnEnwKFnNwbGl0X2dlbmVyY + XRvcl9jb25maWcYDiABKAsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlNwYXJrUmVzb3VyY2VDb25maWdCGeI/FhIUc3BsaXRHZ + W5lcmF0b3JDb25maWdSFHNwbGl0R2VuZXJhdG9yQ29uZmlnEm0KDnRyYWluZXJfY29uZmlnGA8gASgLMjAuc25hcGNoYXQucmVzZ + WFyY2guZ2JtbC5EaXN0cmlidXRlZFRyYWluZXJDb25maWdCFBgB4j8PEg10cmFpbmVyQ29uZmlnUg10cmFpbmVyQ29uZmlnEnQKE + WluZmVyZW5jZXJfY29uZmlnGBAgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EYXRhZmxvd1Jlc291cmNlQ29uZmlnQhcYA + eI/EhIQaW5mZXJlbmNlckNvbmZpZ1IQaW5mZXJlbmNlckNvbmZpZxKBAQoXdHJhaW5lcl9yZXNvdXJjZV9jb25maWcYESABKAsyL + S5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlRyYWluZXJSZXNvdXJjZUNvbmZpZ0Ia4j8XEhV0cmFpbmVyUmVzb3VyY2VDb25maWdSF + XRyYWluZXJSZXNvdXJjZUNvbmZpZxKNAQoaaW5mZXJlbmNlcl9yZXNvdXJjZV9jb25maWcYEiABKAsyMC5zbmFwY2hhdC5yZXNlY + XJjaC5nYm1sLkluZmVyZW5jZXJSZXNvdXJjZUNvbmZpZ0Id4j8aEhhpbmZlcmVuY2VyUmVzb3VyY2VDb25maWdSGGluZmVyZW5jZ + XJSZXNvdXJjZUNvbmZpZ0IRCg9zaGFyZWRfcmVzb3VyY2Uq4wMKCUNvbXBvbmVudBItChFDb21wb25lbnRfVW5rbm93bhAAGhbiP + xMSEUNvbXBvbmVudF9Vbmtub3duEj8KGkNvbXBvbmVudF9Db25maWdfVmFsaWRhdG9yEAEaH+I/HBIaQ29tcG9uZW50X0NvbmZpZ + 19WYWxpZGF0b3ISPwoaQ29tcG9uZW50X0NvbmZpZ19Qb3B1bGF0b3IQAhof4j8cEhpDb21wb25lbnRfQ29uZmlnX1BvcHVsYXRvc + hJBChtDb21wb25lbnRfRGF0YV9QcmVwcm9jZXNzb3IQAxog4j8dEhtDb21wb25lbnRfRGF0YV9QcmVwcm9jZXNzb3ISPwoaQ29tc + G9uZW50X1N1YmdyYXBoX1NhbXBsZXIQBBof4j8cEhpDb21wb25lbnRfU3ViZ3JhcGhfU2FtcGxlchI9ChlDb21wb25lbnRfU3Bsa + XRfR2VuZXJhdG9yEAUaHuI/GxIZQ29tcG9uZW50X1NwbGl0X0dlbmVyYXRvchItChFDb21wb25lbnRfVHJhaW5lchAGGhbiPxMSE + UNvbXBvbmVudF9UcmFpbmVyEjMKFENvbXBvbmVudF9JbmZlcmVuY2VyEAcaGeI/FhIUQ29tcG9uZW50X0luZmVyZW5jZXJiBnByb + 3RvMw==""" ).mkString) lazy val scalaDescriptor: _root_.scalapb.descriptors.FileDescriptor = { val scalaProto = com.google.protobuf.descriptor.FileDescriptorProto.parseFrom(ProtoBytes) diff --git 
a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala index 21f9ea1c2..d2394a65e 100644 --- a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala +++ b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala @@ -36,6 +36,9 @@ package snapchat.research.gbml.gigl_resource_config * @param reservationAffinity * Compute Engine reservation affinity for the job. * See https://docs.cloud.google.com/vertex-ai/docs/training/use-reservations + * @param tensorboardResourceName + * Existing Vertex AI TensorBoard resource to attach to the job. + * Format: projects/{project}/locations/{region}/tensorboards/{tensorboard_id} */ @SerialVersionUID(0L) final case class VertexAiResourceConfig( @@ -48,6 +51,7 @@ final case class VertexAiResourceConfig( schedulingStrategy: _root_.scala.Predef.String = "", bootDiskSizeGb: _root_.scala.Int = 0, reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = _root_.scala.None, + tensorboardResourceName: _root_.scala.Predef.String = "", unknownFields: _root_.scalapb.UnknownFieldSet = _root_.scalapb.UnknownFieldSet.empty ) extends scalapb.GeneratedMessage with scalapb.lenses.Updatable[VertexAiResourceConfig] { @transient @@ -114,6 +118,13 @@ final case class VertexAiResourceConfig( val __value = reservationAffinity.get __size += 1 + _root_.com.google.protobuf.CodedOutputStream.computeUInt32SizeNoTag(__value.serializedSize) + __value.serializedSize }; + + { + val __value = tensorboardResourceName + if (!__value.isEmpty) { + __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(10, __value) + } + }; __size += unknownFields.serializedSize __size } @@ -181,6 +192,12 @@ final case class VertexAiResourceConfig( 
_output__.writeUInt32NoTag(__m.serializedSize) __m.writeTo(_output__) }; + { + val __v = tensorboardResourceName + if (!__v.isEmpty) { + _output__.writeString(10, __v) + } + }; unknownFields.writeTo(_output__) } def withMachineType(__v: _root_.scala.Predef.String): VertexAiResourceConfig = copy(machineType = __v) @@ -194,6 +211,7 @@ final case class VertexAiResourceConfig( def getReservationAffinity: snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity = reservationAffinity.getOrElse(snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity.defaultInstance) def clearReservationAffinity: VertexAiResourceConfig = copy(reservationAffinity = _root_.scala.None) def withReservationAffinity(__v: snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity): VertexAiResourceConfig = copy(reservationAffinity = Option(__v)) + def withTensorboardResourceName(__v: _root_.scala.Predef.String): VertexAiResourceConfig = copy(tensorboardResourceName = __v) def withUnknownFields(__v: _root_.scalapb.UnknownFieldSet) = copy(unknownFields = __v) def discardUnknownFields = copy(unknownFields = _root_.scalapb.UnknownFieldSet.empty) def getFieldByNumber(__fieldNumber: _root_.scala.Int): _root_.scala.Any = { @@ -231,6 +249,10 @@ final case class VertexAiResourceConfig( if (__t != 0) __t else null } case 9 => reservationAffinity.orNull + case 10 => { + val __t = tensorboardResourceName + if (__t != "") __t else null + } } } def getField(__field: _root_.scalapb.descriptors.FieldDescriptor): _root_.scalapb.descriptors.PValue = { @@ -245,6 +267,7 @@ final case class VertexAiResourceConfig( case 7 => _root_.scalapb.descriptors.PString(schedulingStrategy) case 8 => _root_.scalapb.descriptors.PInt(bootDiskSizeGb) case 9 => reservationAffinity.map(_.toPMessage).getOrElse(_root_.scalapb.descriptors.PEmpty) + case 10 => _root_.scalapb.descriptors.PString(tensorboardResourceName) } } def toProtoString: _root_.scala.Predef.String = 
_root_.scalapb.TextFormat.printToUnicodeString(this) @@ -264,6 +287,7 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat var __schedulingStrategy: _root_.scala.Predef.String = "" var __bootDiskSizeGb: _root_.scala.Int = 0 var __reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = _root_.scala.None + var __tensorboardResourceName: _root_.scala.Predef.String = "" var `_unknownFields__`: _root_.scalapb.UnknownFieldSet.Builder = null var _done__ = false while (!_done__) { @@ -288,6 +312,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat __bootDiskSizeGb = _input__.readUInt32() case 74 => __reservationAffinity = Option(__reservationAffinity.fold(_root_.scalapb.LiteParser.readMessage[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity](_input__))(_root_.scalapb.LiteParser.readMessage(_input__, _))) + case 82 => + __tensorboardResourceName = _input__.readStringRequireUtf8() case tag => if (_unknownFields__ == null) { _unknownFields__ = new _root_.scalapb.UnknownFieldSet.Builder() @@ -305,6 +331,7 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat schedulingStrategy = __schedulingStrategy, bootDiskSizeGb = __bootDiskSizeGb, reservationAffinity = __reservationAffinity, + tensorboardResourceName = __tensorboardResourceName, unknownFields = if (_unknownFields__ == null) _root_.scalapb.UnknownFieldSet.empty else _unknownFields__.result() ) } @@ -320,7 +347,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat gcpRegionOverride = __fieldsMap.get(scalaDescriptor.findFieldByNumber(6).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), schedulingStrategy = __fieldsMap.get(scalaDescriptor.findFieldByNumber(7).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), bootDiskSizeGb = 
__fieldsMap.get(scalaDescriptor.findFieldByNumber(8).get).map(_.as[_root_.scala.Int]).getOrElse(0), - reservationAffinity = __fieldsMap.get(scalaDescriptor.findFieldByNumber(9).get).flatMap(_.as[_root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity]]) + reservationAffinity = __fieldsMap.get(scalaDescriptor.findFieldByNumber(9).get).flatMap(_.as[_root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity]]), + tensorboardResourceName = __fieldsMap.get(scalaDescriptor.findFieldByNumber(10).get).map(_.as[_root_.scala.Predef.String]).getOrElse("") ) case _ => throw new RuntimeException("Expected PMessage") } @@ -344,7 +372,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat gcpRegionOverride = "", schedulingStrategy = "", bootDiskSizeGb = 0, - reservationAffinity = _root_.scala.None + reservationAffinity = _root_.scala.None, + tensorboardResourceName = "" ) implicit class VertexAiResourceConfigLens[UpperPB](_l: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig]) extends _root_.scalapb.lenses.ObjectLens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig](_l) { def machineType: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.machineType)((c_, f_) => c_.copy(machineType = f_)) @@ -357,6 +386,7 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat def bootDiskSizeGb: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Int] = field(_.bootDiskSizeGb)((c_, f_) => c_.copy(bootDiskSizeGb = f_)) def reservationAffinity: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = field(_.getReservationAffinity)((c_, f_) => c_.copy(reservationAffinity = Option(f_))) def optionalReservationAffinity: _root_.scalapb.lenses.Lens[UpperPB, 
_root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity]] = field(_.reservationAffinity)((c_, f_) => c_.copy(reservationAffinity = f_)) + def tensorboardResourceName: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.tensorboardResourceName)((c_, f_) => c_.copy(tensorboardResourceName = f_)) } final val MACHINE_TYPE_FIELD_NUMBER = 1 final val GPU_TYPE_FIELD_NUMBER = 2 @@ -367,6 +397,7 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat final val SCHEDULING_STRATEGY_FIELD_NUMBER = 7 final val BOOT_DISK_SIZE_GB_FIELD_NUMBER = 8 final val RESERVATION_AFFINITY_FIELD_NUMBER = 9 + final val TENSORBOARD_RESOURCE_NAME_FIELD_NUMBER = 10 def of( machineType: _root_.scala.Predef.String, gpuType: _root_.scala.Predef.String, @@ -376,7 +407,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat gcpRegionOverride: _root_.scala.Predef.String, schedulingStrategy: _root_.scala.Predef.String, bootDiskSizeGb: _root_.scala.Int, - reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] + reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity], + tensorboardResourceName: _root_.scala.Predef.String ): _root_.snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig = _root_.snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig( machineType, gpuType, @@ -386,7 +418,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat gcpRegionOverride, schedulingStrategy, bootDiskSizeGb, - reservationAffinity + reservationAffinity, + tensorboardResourceName ) // @@protoc_insertion_point(GeneratedMessageCompanion[snapchat.research.gbml.VertexAiResourceConfig]) } diff --git a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/DistributedInferencerConfig.scala 
b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/DistributedInferencerConfig.scala index 8363bdb1f..2198a2eb5 100644 --- a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/DistributedInferencerConfig.scala +++ b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/DistributedInferencerConfig.scala @@ -38,7 +38,7 @@ final case class DistributedInferencerConfig( __serializedSizeMemoized = __size } __size - 1 - + } def writeTo(`_output__`: _root_.com.google.protobuf.CodedOutputStream): _root_.scala.Unit = { trainerConfig.vertexAiInferencerConfig.foreach { __v => @@ -165,7 +165,7 @@ object DistributedInferencerConfig extends scalapb.GeneratedMessageCompanion[sna override def number: _root_.scala.Int = 0 override def value: _root_.scala.Nothing = throw new java.util.NoSuchElementException("Empty.value") } - + @SerialVersionUID(0L) final case class VertexAiInferencerConfig(value: snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig) extends snapchat.research.gbml.gigl_resource_config.DistributedInferencerConfig.TrainerConfig { type ValueType = snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig diff --git a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala index a086f6113..94ffd417b 100644 --- a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala +++ b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala @@ -48,7 +48,7 @@ object GiglResourceConfigProto extends _root_.scalapb.GeneratedFileObject { XQSMwoMbnVtX3JlcGxpY2FzGAUgASgNQhDiPw0SC251bVJlcGxpY2FzUgtudW1SZXBsaWNhcyJGChJMb2NhbFRyYWluZXJDb25ma 
WcSMAoLbnVtX3dvcmtlcnMYASABKA1CD+I/DBIKbnVtV29ya2Vyc1IKbnVtV29ya2VycyKZAQobVmVydGV4QWlSZXNlcnZhdGlvb kFmZmluaXR5Eh0KBHR5cGUYASABKAlCCeI/BhIEdHlwZVIEdHlwZRJbChpyZXNlcnZhdGlvbl9yZXNvdXJjZV9uYW1lcxgCIAMoC - UId4j8aEhhyZXNlcnZhdGlvblJlc291cmNlTmFtZXNSGHJlc2VydmF0aW9uUmVzb3VyY2VOYW1lcyLUBAoWVmVydGV4QWlSZXNvd + UId4j8aEhhyZXNlcnZhdGlvblJlc291cmNlTmFtZXNSGHJlc2VydmF0aW9uUmVzb3VyY2VOYW1lcyKuBQoWVmVydGV4QWlSZXNvd XJjZUNvbmZpZxIzCgxtYWNoaW5lX3R5cGUYASABKAlCEOI/DRILbWFjaGluZVR5cGVSC21hY2hpbmVUeXBlEicKCGdwdV90eXBlG AIgASgJQgziPwkSB2dwdVR5cGVSB2dwdVR5cGUSKgoJZ3B1X2xpbWl0GAMgASgNQg3iPwoSCGdwdUxpbWl0UghncHVMaW1pdBIzC gxudW1fcmVwbGljYXMYBCABKA1CEOI/DRILbnVtUmVwbGljYXNSC251bVJlcGxpY2FzEiYKB3RpbWVvdXQYBSABKA1CDOI/CRIHd @@ -56,74 +56,76 @@ object GiglResourceConfigProto extends _root_.scalapb.GeneratedFileObject { Wdpb25PdmVycmlkZRJIChNzY2hlZHVsaW5nX3N0cmF0ZWd5GAcgASgJQhfiPxQSEnNjaGVkdWxpbmdTdHJhdGVneVISc2NoZWR1b GluZ1N0cmF0ZWd5Ej4KEWJvb3RfZGlza19zaXplX2diGAggASgNQhPiPxASDmJvb3REaXNrU2l6ZUdiUg5ib290RGlza1NpemVHY hKAAQoUcmVzZXJ2YXRpb25fYWZmaW5pdHkYCSABKAsyMy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpUmVzZXJ2YXRpb - 25BZmZpbml0eUIY4j8VEhNyZXNlcnZhdGlvbkFmZmluaXR5UhNyZXNlcnZhdGlvbkFmZmluaXR5IooCChFLRlBSZXNvdXJjZUNvb - mZpZxIwCgtjcHVfcmVxdWVzdBgBIAEoCUIP4j8MEgpjcHVSZXF1ZXN0UgpjcHVSZXF1ZXN0EjkKDm1lbW9yeV9yZXF1ZXN0GAIgA - SgJQhLiPw8SDW1lbW9yeVJlcXVlc3RSDW1lbW9yeVJlcXVlc3QSJwoIZ3B1X3R5cGUYAyABKAlCDOI/CRIHZ3B1VHlwZVIHZ3B1V - HlwZRIqCglncHVfbGltaXQYBCABKA1CDeI/ChIIZ3B1TGltaXRSCGdwdUxpbWl0EjMKDG51bV9yZXBsaWNhcxgFIAEoDUIQ4j8NE - gtudW1SZXBsaWNhc1ILbnVtUmVwbGljYXMiRwoTTG9jYWxSZXNvdXJjZUNvbmZpZxIwCgtudW1fd29ya2VycxgBIAEoDUIP4j8ME - gpudW1Xb3JrZXJzUgpudW1Xb3JrZXJzItkCChhWZXJ0ZXhBaUdyYXBoU3RvcmVDb25maWcSbQoQZ3JhcGhfc3RvcmVfcG9vbBgBI - AEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZXNvdXJjZUNvbmZpZ0IT4j8QEg5ncmFwaFN0b3JlUG9vbFIOZ - 3JhcGhTdG9yZVBvb2wSYwoMY29tcHV0ZV9wb29sGAIgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaVJlc291c - 
mNlQ29uZmlnQhDiPw0SC2NvbXB1dGVQb29sUgtjb21wdXRlUG9vbBJpCiBjb21wdXRlX2NsdXN0ZXJfbG9jYWxfd29ybGRfc2l6Z - RgDIAEoBUIh4j8eEhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTaXplUhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTaXplIp0DC - hhEaXN0cmlidXRlZFRyYWluZXJDb25maWcShAEKGHZlcnRleF9haV90cmFpbmVyX2NvbmZpZxgBIAEoCzItLnNuYXBjaGF0LnJlc - 2VhcmNoLmdibWwuVmVydGV4QWlUcmFpbmVyQ29uZmlnQhriPxcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyY - WluZXJDb25maWcSbwoSa2ZwX3RyYWluZXJfY29uZmlnGAIgASgLMiguc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBUcmFpbmVyQ - 29uZmlnQhXiPxISEGtmcFRyYWluZXJDb25maWdIAFIQa2ZwVHJhaW5lckNvbmZpZxJ3ChRsb2NhbF90cmFpbmVyX2NvbmZpZxgDI - AEoCzIqLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxUcmFpbmVyQ29uZmlnQhfiPxQSEmxvY2FsVHJhaW5lckNvbmZpZ0gAU - hJsb2NhbFRyYWluZXJDb25maWdCEAoOdHJhaW5lcl9jb25maWcixwQKFVRyYWluZXJSZXNvdXJjZUNvbmZpZxKFAQoYdmVydGV4X - 2FpX3RyYWluZXJfY29uZmlnGAEgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaVJlc291cmNlQ29uZmlnQhriP - xcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyYWluZXJDb25maWcScAoSa2ZwX3RyYWluZXJfY29uZmlnGAIgA - SgLMikuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBSZXNvdXJjZUNvbmZpZ0IV4j8SEhBrZnBUcmFpbmVyQ29uZmlnSABSEGtmc - FRyYWluZXJDb25maWcSeAoUbG9jYWxfdHJhaW5lcl9jb25maWcYAyABKAsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkxvY2FsU - mVzb3VyY2VDb25maWdCF+I/FBISbG9jYWxUcmFpbmVyQ29uZmlnSABSEmxvY2FsVHJhaW5lckNvbmZpZxKnAQokdmVydGV4X2FpX - 2dyYXBoX3N0b3JlX3RyYWluZXJfY29uZmlnGAQgASgLMjAuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaUdyYXBoU3Rvc - mVDb25maWdCJOI/IRIfdmVydGV4QWlHcmFwaFN0b3JlVHJhaW5lckNvbmZpZ0gAUh92ZXJ0ZXhBaUdyYXBoU3RvcmVUcmFpbmVyQ - 29uZmlnQhAKDnRyYWluZXJfY29uZmlnIocFChhJbmZlcmVuY2VyUmVzb3VyY2VDb25maWcSjgEKG3ZlcnRleF9haV9pbmZlcmVuY - 2VyX2NvbmZpZxgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZXNvdXJjZUNvbmZpZ0Id4j8aEhh2ZXJ0Z - XhBaUluZmVyZW5jZXJDb25maWdIAFIYdmVydGV4QWlJbmZlcmVuY2VyQ29uZmlnEo0BChpkYXRhZmxvd19pbmZlcmVuY2VyX2Nvb - mZpZxgCIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuRGF0YWZsb3dSZXNvdXJjZUNvbmZpZ0Id4j8aEhhkYXRhZmxvd0luZ - 
mVyZW5jZXJDb25maWdIAFIYZGF0YWZsb3dJbmZlcmVuY2VyQ29uZmlnEoEBChdsb2NhbF9pbmZlcmVuY2VyX2NvbmZpZxgDIAEoC - zIrLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxSZXNvdXJjZUNvbmZpZ0Ia4j8XEhVsb2NhbEluZmVyZW5jZXJDb25maWdIA - FIVbG9jYWxJbmZlcmVuY2VyQ29uZmlnErABCid2ZXJ0ZXhfYWlfZ3JhcGhfc3RvcmVfaW5mZXJlbmNlcl9jb25maWcYBCABKAsyM - C5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpR3JhcGhTdG9yZUNvbmZpZ0In4j8kEiJ2ZXJ0ZXhBaUdyYXBoU3RvcmVJb - mZlcmVuY2VyQ29uZmlnSABSInZlcnRleEFpR3JhcGhTdG9yZUluZmVyZW5jZXJDb25maWdCEwoRaW5mZXJlbmNlcl9jb25maWcil - wgKFFNoYXJlZFJlc291cmNlQ29uZmlnEn4KD3Jlc291cmNlX2xhYmVscxgBIAMoCzJALnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuU - 2hhcmVkUmVzb3VyY2VDb25maWcuUmVzb3VyY2VMYWJlbHNFbnRyeUIT4j8QEg5yZXNvdXJjZUxhYmVsc1IOcmVzb3VyY2VMYWJlb - HMSjgEKFWNvbW1vbl9jb21wdXRlX2NvbmZpZxgCIAEoCzJALnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuU2hhcmVkUmVzb3VyY2VDb - 25maWcuQ29tbW9uQ29tcHV0ZUNvbmZpZ0IY4j8VEhNjb21tb25Db21wdXRlQ29uZmlnUhNjb21tb25Db21wdXRlQ29uZmlnGpQFC - hNDb21tb25Db21wdXRlQ29uZmlnEiYKB3Byb2plY3QYASABKAlCDOI/CRIHcHJvamVjdFIHcHJvamVjdBIjCgZyZWdpb24YAiABK - AlCC+I/CBIGcmVnaW9uUgZyZWdpb24SQwoSdGVtcF9hc3NldHNfYnVja2V0GAMgASgJQhXiPxISEHRlbXBBc3NldHNCdWNrZXRSE - HRlbXBBc3NldHNCdWNrZXQSXAobdGVtcF9yZWdpb25hbF9hc3NldHNfYnVja2V0GAQgASgJQh3iPxoSGHRlbXBSZWdpb25hbEFzc - 2V0c0J1Y2tldFIYdGVtcFJlZ2lvbmFsQXNzZXRzQnVja2V0EkMKEnBlcm1fYXNzZXRzX2J1Y2tldBgFIAEoCUIV4j8SEhBwZXJtQ - XNzZXRzQnVja2V0UhBwZXJtQXNzZXRzQnVja2V0EloKG3RlbXBfYXNzZXRzX2JxX2RhdGFzZXRfbmFtZRgGIAEoCUIc4j8ZEhd0Z - W1wQXNzZXRzQnFEYXRhc2V0TmFtZVIXdGVtcEFzc2V0c0JxRGF0YXNldE5hbWUSVgoZZW1iZWRkaW5nX2JxX2RhdGFzZXRfbmFtZ - RgHIAEoCUIb4j8YEhZlbWJlZGRpbmdCcURhdGFzZXROYW1lUhZlbWJlZGRpbmdCcURhdGFzZXROYW1lElYKGWdjcF9zZXJ2aWNlX - 2FjY291bnRfZW1haWwYCCABKAlCG+I/GBIWZ2NwU2VydmljZUFjY291bnRFbWFpbFIWZ2NwU2VydmljZUFjY291bnRFbWFpbBI8C - g9kYXRhZmxvd19ydW5uZXIYCyABKAlCE+I/EBIOZGF0YWZsb3dSdW5uZXJSDmRhdGFmbG93UnVubmVyGlcKE1Jlc291cmNlTGFiZ - WxzRW50cnkSGgoDa2V5GAEgASgJQgjiPwUSA2tleVIDa2V5EiAKBXZhbHVlGAIgASgJQgriPwcSBXZhbHVlUgV2YWx1ZToCOAEi9 - 
wgKEkdpZ2xSZXNvdXJjZUNvbmZpZxJbChpzaGFyZWRfcmVzb3VyY2VfY29uZmlnX3VyaRgBIAEoCUIc4j8ZEhdzaGFyZWRSZXNvd - XJjZUNvbmZpZ1VyaUgAUhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1VyaRJ/ChZzaGFyZWRfcmVzb3VyY2VfY29uZmlnGAIgASgLMiwuc - 25hcGNoYXQucmVzZWFyY2guZ2JtbC5TaGFyZWRSZXNvdXJjZUNvbmZpZ0IZ4j8WEhRzaGFyZWRSZXNvdXJjZUNvbmZpZ0gAUhRza - GFyZWRSZXNvdXJjZUNvbmZpZxJ4ChNwcmVwcm9jZXNzb3JfY29uZmlnGAwgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EY - XRhUHJlcHJvY2Vzc29yQ29uZmlnQhfiPxQSEnByZXByb2Nlc3NvckNvbmZpZ1IScHJlcHJvY2Vzc29yQ29uZmlnEn8KF3N1YmdyY - XBoX3NhbXBsZXJfY29uZmlnGA0gASgLMisuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5TcGFya1Jlc291cmNlQ29uZmlnQhriPxcSF - XN1YmdyYXBoU2FtcGxlckNvbmZpZ1IVc3ViZ3JhcGhTYW1wbGVyQ29uZmlnEnwKFnNwbGl0X2dlbmVyYXRvcl9jb25maWcYDiABK - AsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlNwYXJrUmVzb3VyY2VDb25maWdCGeI/FhIUc3BsaXRHZW5lcmF0b3JDb25maWdSF - HNwbGl0R2VuZXJhdG9yQ29uZmlnEm0KDnRyYWluZXJfY29uZmlnGA8gASgLMjAuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EaXN0c - mlidXRlZFRyYWluZXJDb25maWdCFBgB4j8PEg10cmFpbmVyQ29uZmlnUg10cmFpbmVyQ29uZmlnEnQKEWluZmVyZW5jZXJfY29uZ - mlnGBAgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EYXRhZmxvd1Jlc291cmNlQ29uZmlnQhcYAeI/EhIQaW5mZXJlbmNlc - kNvbmZpZ1IQaW5mZXJlbmNlckNvbmZpZxKBAQoXdHJhaW5lcl9yZXNvdXJjZV9jb25maWcYESABKAsyLS5zbmFwY2hhdC5yZXNlY - XJjaC5nYm1sLlRyYWluZXJSZXNvdXJjZUNvbmZpZ0Ia4j8XEhV0cmFpbmVyUmVzb3VyY2VDb25maWdSFXRyYWluZXJSZXNvdXJjZ - UNvbmZpZxKNAQoaaW5mZXJlbmNlcl9yZXNvdXJjZV9jb25maWcYEiABKAsyMC5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkluZmVyZ - W5jZXJSZXNvdXJjZUNvbmZpZ0Id4j8aEhhpbmZlcmVuY2VyUmVzb3VyY2VDb25maWdSGGluZmVyZW5jZXJSZXNvdXJjZUNvbmZpZ - 0IRCg9zaGFyZWRfcmVzb3VyY2Uq4wMKCUNvbXBvbmVudBItChFDb21wb25lbnRfVW5rbm93bhAAGhbiPxMSEUNvbXBvbmVudF9Vb - mtub3duEj8KGkNvbXBvbmVudF9Db25maWdfVmFsaWRhdG9yEAEaH+I/HBIaQ29tcG9uZW50X0NvbmZpZ19WYWxpZGF0b3ISPwoaQ - 29tcG9uZW50X0NvbmZpZ19Qb3B1bGF0b3IQAhof4j8cEhpDb21wb25lbnRfQ29uZmlnX1BvcHVsYXRvchJBChtDb21wb25lbnRfR - GF0YV9QcmVwcm9jZXNzb3IQAxog4j8dEhtDb21wb25lbnRfRGF0YV9QcmVwcm9jZXNzb3ISPwoaQ29tcG9uZW50X1N1YmdyYXBoX - 
1NhbXBsZXIQBBof4j8cEhpDb21wb25lbnRfU3ViZ3JhcGhfU2FtcGxlchI9ChlDb21wb25lbnRfU3BsaXRfR2VuZXJhdG9yEAUaH - uI/GxIZQ29tcG9uZW50X1NwbGl0X0dlbmVyYXRvchItChFDb21wb25lbnRfVHJhaW5lchAGGhbiPxMSEUNvbXBvbmVudF9UcmFpb - mVyEjMKFENvbXBvbmVudF9JbmZlcmVuY2VyEAcaGeI/FhIUQ29tcG9uZW50X0luZmVyZW5jZXJiBnByb3RvMw==""" + 25BZmZpbml0eUIY4j8VEhNyZXNlcnZhdGlvbkFmZmluaXR5UhNyZXNlcnZhdGlvbkFmZmluaXR5ElgKGXRlbnNvcmJvYXJkX3Jlc + 291cmNlX25hbWUYCiABKAlCHOI/GRIXdGVuc29yYm9hcmRSZXNvdXJjZU5hbWVSF3RlbnNvcmJvYXJkUmVzb3VyY2VOYW1lIooCC + hFLRlBSZXNvdXJjZUNvbmZpZxIwCgtjcHVfcmVxdWVzdBgBIAEoCUIP4j8MEgpjcHVSZXF1ZXN0UgpjcHVSZXF1ZXN0EjkKDm1lb + W9yeV9yZXF1ZXN0GAIgASgJQhLiPw8SDW1lbW9yeVJlcXVlc3RSDW1lbW9yeVJlcXVlc3QSJwoIZ3B1X3R5cGUYAyABKAlCDOI/C + RIHZ3B1VHlwZVIHZ3B1VHlwZRIqCglncHVfbGltaXQYBCABKA1CDeI/ChIIZ3B1TGltaXRSCGdwdUxpbWl0EjMKDG51bV9yZXBsa + WNhcxgFIAEoDUIQ4j8NEgtudW1SZXBsaWNhc1ILbnVtUmVwbGljYXMiRwoTTG9jYWxSZXNvdXJjZUNvbmZpZxIwCgtudW1fd29ya + 2VycxgBIAEoDUIP4j8MEgpudW1Xb3JrZXJzUgpudW1Xb3JrZXJzItkCChhWZXJ0ZXhBaUdyYXBoU3RvcmVDb25maWcSbQoQZ3Jhc + Ghfc3RvcmVfcG9vbBgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZXNvdXJjZUNvbmZpZ0IT4j8QEg5nc + mFwaFN0b3JlUG9vbFIOZ3JhcGhTdG9yZVBvb2wSYwoMY29tcHV0ZV9wb29sGAIgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2Jtb + C5WZXJ0ZXhBaVJlc291cmNlQ29uZmlnQhDiPw0SC2NvbXB1dGVQb29sUgtjb21wdXRlUG9vbBJpCiBjb21wdXRlX2NsdXN0ZXJfb + G9jYWxfd29ybGRfc2l6ZRgDIAEoBUIh4j8eEhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTaXplUhxjb21wdXRlQ2x1c3RlckxvY + 2FsV29ybGRTaXplIp0DChhEaXN0cmlidXRlZFRyYWluZXJDb25maWcShAEKGHZlcnRleF9haV90cmFpbmVyX2NvbmZpZxgBIAEoC + zItLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlUcmFpbmVyQ29uZmlnQhriPxcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ + 0gAUhV2ZXJ0ZXhBaVRyYWluZXJDb25maWcSbwoSa2ZwX3RyYWluZXJfY29uZmlnGAIgASgLMiguc25hcGNoYXQucmVzZWFyY2guZ + 2JtbC5LRlBUcmFpbmVyQ29uZmlnQhXiPxISEGtmcFRyYWluZXJDb25maWdIAFIQa2ZwVHJhaW5lckNvbmZpZxJ3ChRsb2NhbF90c + mFpbmVyX2NvbmZpZxgDIAEoCzIqLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxUcmFpbmVyQ29uZmlnQhfiPxQSEmxvY2FsV + 
HJhaW5lckNvbmZpZ0gAUhJsb2NhbFRyYWluZXJDb25maWdCEAoOdHJhaW5lcl9jb25maWcixwQKFVRyYWluZXJSZXNvdXJjZUNvb + mZpZxKFAQoYdmVydGV4X2FpX3RyYWluZXJfY29uZmlnGAEgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaVJlc + 291cmNlQ29uZmlnQhriPxcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyYWluZXJDb25maWcScAoSa2ZwX3RyY + WluZXJfY29uZmlnGAIgASgLMikuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBSZXNvdXJjZUNvbmZpZ0IV4j8SEhBrZnBUcmFpb + mVyQ29uZmlnSABSEGtmcFRyYWluZXJDb25maWcSeAoUbG9jYWxfdHJhaW5lcl9jb25maWcYAyABKAsyKy5zbmFwY2hhdC5yZXNlY + XJjaC5nYm1sLkxvY2FsUmVzb3VyY2VDb25maWdCF+I/FBISbG9jYWxUcmFpbmVyQ29uZmlnSABSEmxvY2FsVHJhaW5lckNvbmZpZ + xKnAQokdmVydGV4X2FpX2dyYXBoX3N0b3JlX3RyYWluZXJfY29uZmlnGAQgASgLMjAuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZ + XJ0ZXhBaUdyYXBoU3RvcmVDb25maWdCJOI/IRIfdmVydGV4QWlHcmFwaFN0b3JlVHJhaW5lckNvbmZpZ0gAUh92ZXJ0ZXhBaUdyY + XBoU3RvcmVUcmFpbmVyQ29uZmlnQhAKDnRyYWluZXJfY29uZmlnIocFChhJbmZlcmVuY2VyUmVzb3VyY2VDb25maWcSjgEKG3Zlc + nRleF9haV9pbmZlcmVuY2VyX2NvbmZpZxgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZXNvdXJjZUNvb + mZpZ0Id4j8aEhh2ZXJ0ZXhBaUluZmVyZW5jZXJDb25maWdIAFIYdmVydGV4QWlJbmZlcmVuY2VyQ29uZmlnEo0BChpkYXRhZmxvd + 19pbmZlcmVuY2VyX2NvbmZpZxgCIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuRGF0YWZsb3dSZXNvdXJjZUNvbmZpZ0Id4 + j8aEhhkYXRhZmxvd0luZmVyZW5jZXJDb25maWdIAFIYZGF0YWZsb3dJbmZlcmVuY2VyQ29uZmlnEoEBChdsb2NhbF9pbmZlcmVuY + 2VyX2NvbmZpZxgDIAEoCzIrLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxSZXNvdXJjZUNvbmZpZ0Ia4j8XEhVsb2NhbEluZ + mVyZW5jZXJDb25maWdIAFIVbG9jYWxJbmZlcmVuY2VyQ29uZmlnErABCid2ZXJ0ZXhfYWlfZ3JhcGhfc3RvcmVfaW5mZXJlbmNlc + l9jb25maWcYBCABKAsyMC5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpR3JhcGhTdG9yZUNvbmZpZ0In4j8kEiJ2ZXJ0Z + XhBaUdyYXBoU3RvcmVJbmZlcmVuY2VyQ29uZmlnSABSInZlcnRleEFpR3JhcGhTdG9yZUluZmVyZW5jZXJDb25maWdCEwoRaW5mZ + XJlbmNlcl9jb25maWcilwgKFFNoYXJlZFJlc291cmNlQ29uZmlnEn4KD3Jlc291cmNlX2xhYmVscxgBIAMoCzJALnNuYXBjaGF0L + nJlc2VhcmNoLmdibWwuU2hhcmVkUmVzb3VyY2VDb25maWcuUmVzb3VyY2VMYWJlbHNFbnRyeUIT4j8QEg5yZXNvdXJjZUxhYmVsc + 
1IOcmVzb3VyY2VMYWJlbHMSjgEKFWNvbW1vbl9jb21wdXRlX2NvbmZpZxgCIAEoCzJALnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuU + 2hhcmVkUmVzb3VyY2VDb25maWcuQ29tbW9uQ29tcHV0ZUNvbmZpZ0IY4j8VEhNjb21tb25Db21wdXRlQ29uZmlnUhNjb21tb25Db + 21wdXRlQ29uZmlnGpQFChNDb21tb25Db21wdXRlQ29uZmlnEiYKB3Byb2plY3QYASABKAlCDOI/CRIHcHJvamVjdFIHcHJvamVjd + BIjCgZyZWdpb24YAiABKAlCC+I/CBIGcmVnaW9uUgZyZWdpb24SQwoSdGVtcF9hc3NldHNfYnVja2V0GAMgASgJQhXiPxISEHRlb + XBBc3NldHNCdWNrZXRSEHRlbXBBc3NldHNCdWNrZXQSXAobdGVtcF9yZWdpb25hbF9hc3NldHNfYnVja2V0GAQgASgJQh3iPxoSG + HRlbXBSZWdpb25hbEFzc2V0c0J1Y2tldFIYdGVtcFJlZ2lvbmFsQXNzZXRzQnVja2V0EkMKEnBlcm1fYXNzZXRzX2J1Y2tldBgFI + AEoCUIV4j8SEhBwZXJtQXNzZXRzQnVja2V0UhBwZXJtQXNzZXRzQnVja2V0EloKG3RlbXBfYXNzZXRzX2JxX2RhdGFzZXRfbmFtZ + RgGIAEoCUIc4j8ZEhd0ZW1wQXNzZXRzQnFEYXRhc2V0TmFtZVIXdGVtcEFzc2V0c0JxRGF0YXNldE5hbWUSVgoZZW1iZWRkaW5nX + 2JxX2RhdGFzZXRfbmFtZRgHIAEoCUIb4j8YEhZlbWJlZGRpbmdCcURhdGFzZXROYW1lUhZlbWJlZGRpbmdCcURhdGFzZXROYW1lE + lYKGWdjcF9zZXJ2aWNlX2FjY291bnRfZW1haWwYCCABKAlCG+I/GBIWZ2NwU2VydmljZUFjY291bnRFbWFpbFIWZ2NwU2VydmljZ + UFjY291bnRFbWFpbBI8Cg9kYXRhZmxvd19ydW5uZXIYCyABKAlCE+I/EBIOZGF0YWZsb3dSdW5uZXJSDmRhdGFmbG93UnVubmVyG + lcKE1Jlc291cmNlTGFiZWxzRW50cnkSGgoDa2V5GAEgASgJQgjiPwUSA2tleVIDa2V5EiAKBXZhbHVlGAIgASgJQgriPwcSBXZhb + HVlUgV2YWx1ZToCOAEi9wgKEkdpZ2xSZXNvdXJjZUNvbmZpZxJbChpzaGFyZWRfcmVzb3VyY2VfY29uZmlnX3VyaRgBIAEoCUIc4 + j8ZEhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1VyaUgAUhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1VyaRJ/ChZzaGFyZWRfcmVzb3VyY2VfY + 29uZmlnGAIgASgLMiwuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5TaGFyZWRSZXNvdXJjZUNvbmZpZ0IZ4j8WEhRzaGFyZWRSZXNvd + XJjZUNvbmZpZ0gAUhRzaGFyZWRSZXNvdXJjZUNvbmZpZxJ4ChNwcmVwcm9jZXNzb3JfY29uZmlnGAwgASgLMi4uc25hcGNoYXQuc + mVzZWFyY2guZ2JtbC5EYXRhUHJlcHJvY2Vzc29yQ29uZmlnQhfiPxQSEnByZXByb2Nlc3NvckNvbmZpZ1IScHJlcHJvY2Vzc29yQ + 29uZmlnEn8KF3N1YmdyYXBoX3NhbXBsZXJfY29uZmlnGA0gASgLMisuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5TcGFya1Jlc291c + mNlQ29uZmlnQhriPxcSFXN1YmdyYXBoU2FtcGxlckNvbmZpZ1IVc3ViZ3JhcGhTYW1wbGVyQ29uZmlnEnwKFnNwbGl0X2dlbmVyY + 
XRvcl9jb25maWcYDiABKAsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlNwYXJrUmVzb3VyY2VDb25maWdCGeI/FhIUc3BsaXRHZ + W5lcmF0b3JDb25maWdSFHNwbGl0R2VuZXJhdG9yQ29uZmlnEm0KDnRyYWluZXJfY29uZmlnGA8gASgLMjAuc25hcGNoYXQucmVzZ + WFyY2guZ2JtbC5EaXN0cmlidXRlZFRyYWluZXJDb25maWdCFBgB4j8PEg10cmFpbmVyQ29uZmlnUg10cmFpbmVyQ29uZmlnEnQKE + WluZmVyZW5jZXJfY29uZmlnGBAgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EYXRhZmxvd1Jlc291cmNlQ29uZmlnQhcYA + eI/EhIQaW5mZXJlbmNlckNvbmZpZ1IQaW5mZXJlbmNlckNvbmZpZxKBAQoXdHJhaW5lcl9yZXNvdXJjZV9jb25maWcYESABKAsyL + S5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlRyYWluZXJSZXNvdXJjZUNvbmZpZ0Ia4j8XEhV0cmFpbmVyUmVzb3VyY2VDb25maWdSF + XRyYWluZXJSZXNvdXJjZUNvbmZpZxKNAQoaaW5mZXJlbmNlcl9yZXNvdXJjZV9jb25maWcYEiABKAsyMC5zbmFwY2hhdC5yZXNlY + XJjaC5nYm1sLkluZmVyZW5jZXJSZXNvdXJjZUNvbmZpZ0Id4j8aEhhpbmZlcmVuY2VyUmVzb3VyY2VDb25maWdSGGluZmVyZW5jZ + XJSZXNvdXJjZUNvbmZpZ0IRCg9zaGFyZWRfcmVzb3VyY2Uq4wMKCUNvbXBvbmVudBItChFDb21wb25lbnRfVW5rbm93bhAAGhbiP + xMSEUNvbXBvbmVudF9Vbmtub3duEj8KGkNvbXBvbmVudF9Db25maWdfVmFsaWRhdG9yEAEaH+I/HBIaQ29tcG9uZW50X0NvbmZpZ + 19WYWxpZGF0b3ISPwoaQ29tcG9uZW50X0NvbmZpZ19Qb3B1bGF0b3IQAhof4j8cEhpDb21wb25lbnRfQ29uZmlnX1BvcHVsYXRvc + hJBChtDb21wb25lbnRfRGF0YV9QcmVwcm9jZXNzb3IQAxog4j8dEhtDb21wb25lbnRfRGF0YV9QcmVwcm9jZXNzb3ISPwoaQ29tc + G9uZW50X1N1YmdyYXBoX1NhbXBsZXIQBBof4j8cEhpDb21wb25lbnRfU3ViZ3JhcGhfU2FtcGxlchI9ChlDb21wb25lbnRfU3Bsa + XRfR2VuZXJhdG9yEAUaHuI/GxIZQ29tcG9uZW50X1NwbGl0X0dlbmVyYXRvchItChFDb21wb25lbnRfVHJhaW5lchAGGhbiPxMSE + UNvbXBvbmVudF9UcmFpbmVyEjMKFENvbXBvbmVudF9JbmZlcmVuY2VyEAcaGeI/FhIUQ29tcG9uZW50X0luZmVyZW5jZXJiBnByb + 3RvMw==""" ).mkString) lazy val scalaDescriptor: _root_.scalapb.descriptors.FileDescriptor = { val scalaProto = com.google.protobuf.descriptor.FileDescriptorProto.parseFrom(ProtoBytes) diff --git a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala index 21f9ea1c2..d2394a65e 100644 
--- a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala +++ b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala @@ -36,6 +36,9 @@ package snapchat.research.gbml.gigl_resource_config * @param reservationAffinity * Compute Engine reservation affinity for the job. * See https://docs.cloud.google.com/vertex-ai/docs/training/use-reservations + * @param tensorboardResourceName + * Existing Vertex AI TensorBoard resource to attach to the job. + * Format: projects/{project}/locations/{region}/tensorboards/{tensorboard_id} */ @SerialVersionUID(0L) final case class VertexAiResourceConfig( @@ -48,6 +51,7 @@ final case class VertexAiResourceConfig( schedulingStrategy: _root_.scala.Predef.String = "", bootDiskSizeGb: _root_.scala.Int = 0, reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = _root_.scala.None, + tensorboardResourceName: _root_.scala.Predef.String = "", unknownFields: _root_.scalapb.UnknownFieldSet = _root_.scalapb.UnknownFieldSet.empty ) extends scalapb.GeneratedMessage with scalapb.lenses.Updatable[VertexAiResourceConfig] { @transient @@ -114,6 +118,13 @@ final case class VertexAiResourceConfig( val __value = reservationAffinity.get __size += 1 + _root_.com.google.protobuf.CodedOutputStream.computeUInt32SizeNoTag(__value.serializedSize) + __value.serializedSize }; + + { + val __value = tensorboardResourceName + if (!__value.isEmpty) { + __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(10, __value) + } + }; __size += unknownFields.serializedSize __size } @@ -181,6 +192,12 @@ final case class VertexAiResourceConfig( _output__.writeUInt32NoTag(__m.serializedSize) __m.writeTo(_output__) }; + { + val __v = tensorboardResourceName + if (!__v.isEmpty) { + _output__.writeString(10, __v) + } + }; unknownFields.writeTo(_output__) } def withMachineType(__v: 
_root_.scala.Predef.String): VertexAiResourceConfig = copy(machineType = __v) @@ -194,6 +211,7 @@ final case class VertexAiResourceConfig( def getReservationAffinity: snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity = reservationAffinity.getOrElse(snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity.defaultInstance) def clearReservationAffinity: VertexAiResourceConfig = copy(reservationAffinity = _root_.scala.None) def withReservationAffinity(__v: snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity): VertexAiResourceConfig = copy(reservationAffinity = Option(__v)) + def withTensorboardResourceName(__v: _root_.scala.Predef.String): VertexAiResourceConfig = copy(tensorboardResourceName = __v) def withUnknownFields(__v: _root_.scalapb.UnknownFieldSet) = copy(unknownFields = __v) def discardUnknownFields = copy(unknownFields = _root_.scalapb.UnknownFieldSet.empty) def getFieldByNumber(__fieldNumber: _root_.scala.Int): _root_.scala.Any = { @@ -231,6 +249,10 @@ final case class VertexAiResourceConfig( if (__t != 0) __t else null } case 9 => reservationAffinity.orNull + case 10 => { + val __t = tensorboardResourceName + if (__t != "") __t else null + } } } def getField(__field: _root_.scalapb.descriptors.FieldDescriptor): _root_.scalapb.descriptors.PValue = { @@ -245,6 +267,7 @@ final case class VertexAiResourceConfig( case 7 => _root_.scalapb.descriptors.PString(schedulingStrategy) case 8 => _root_.scalapb.descriptors.PInt(bootDiskSizeGb) case 9 => reservationAffinity.map(_.toPMessage).getOrElse(_root_.scalapb.descriptors.PEmpty) + case 10 => _root_.scalapb.descriptors.PString(tensorboardResourceName) } } def toProtoString: _root_.scala.Predef.String = _root_.scalapb.TextFormat.printToUnicodeString(this) @@ -264,6 +287,7 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat var __schedulingStrategy: _root_.scala.Predef.String = "" var __bootDiskSizeGb: _root_.scala.Int = 0 var 
__reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = _root_.scala.None + var __tensorboardResourceName: _root_.scala.Predef.String = "" var `_unknownFields__`: _root_.scalapb.UnknownFieldSet.Builder = null var _done__ = false while (!_done__) { @@ -288,6 +312,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat __bootDiskSizeGb = _input__.readUInt32() case 74 => __reservationAffinity = Option(__reservationAffinity.fold(_root_.scalapb.LiteParser.readMessage[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity](_input__))(_root_.scalapb.LiteParser.readMessage(_input__, _))) + case 82 => + __tensorboardResourceName = _input__.readStringRequireUtf8() case tag => if (_unknownFields__ == null) { _unknownFields__ = new _root_.scalapb.UnknownFieldSet.Builder() @@ -305,6 +331,7 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat schedulingStrategy = __schedulingStrategy, bootDiskSizeGb = __bootDiskSizeGb, reservationAffinity = __reservationAffinity, + tensorboardResourceName = __tensorboardResourceName, unknownFields = if (_unknownFields__ == null) _root_.scalapb.UnknownFieldSet.empty else _unknownFields__.result() ) } @@ -320,7 +347,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat gcpRegionOverride = __fieldsMap.get(scalaDescriptor.findFieldByNumber(6).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), schedulingStrategy = __fieldsMap.get(scalaDescriptor.findFieldByNumber(7).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), bootDiskSizeGb = __fieldsMap.get(scalaDescriptor.findFieldByNumber(8).get).map(_.as[_root_.scala.Int]).getOrElse(0), - reservationAffinity = __fieldsMap.get(scalaDescriptor.findFieldByNumber(9).get).flatMap(_.as[_root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity]]) + reservationAffinity = 
__fieldsMap.get(scalaDescriptor.findFieldByNumber(9).get).flatMap(_.as[_root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity]]), + tensorboardResourceName = __fieldsMap.get(scalaDescriptor.findFieldByNumber(10).get).map(_.as[_root_.scala.Predef.String]).getOrElse("") ) case _ => throw new RuntimeException("Expected PMessage") } @@ -344,7 +372,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat gcpRegionOverride = "", schedulingStrategy = "", bootDiskSizeGb = 0, - reservationAffinity = _root_.scala.None + reservationAffinity = _root_.scala.None, + tensorboardResourceName = "" ) implicit class VertexAiResourceConfigLens[UpperPB](_l: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig]) extends _root_.scalapb.lenses.ObjectLens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig](_l) { def machineType: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.machineType)((c_, f_) => c_.copy(machineType = f_)) @@ -357,6 +386,7 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat def bootDiskSizeGb: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Int] = field(_.bootDiskSizeGb)((c_, f_) => c_.copy(bootDiskSizeGb = f_)) def reservationAffinity: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = field(_.getReservationAffinity)((c_, f_) => c_.copy(reservationAffinity = Option(f_))) def optionalReservationAffinity: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity]] = field(_.reservationAffinity)((c_, f_) => c_.copy(reservationAffinity = f_)) + def tensorboardResourceName: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.tensorboardResourceName)((c_, f_) => c_.copy(tensorboardResourceName = f_)) } final val 
MACHINE_TYPE_FIELD_NUMBER = 1 final val GPU_TYPE_FIELD_NUMBER = 2 @@ -367,6 +397,7 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat final val SCHEDULING_STRATEGY_FIELD_NUMBER = 7 final val BOOT_DISK_SIZE_GB_FIELD_NUMBER = 8 final val RESERVATION_AFFINITY_FIELD_NUMBER = 9 + final val TENSORBOARD_RESOURCE_NAME_FIELD_NUMBER = 10 def of( machineType: _root_.scala.Predef.String, gpuType: _root_.scala.Predef.String, @@ -376,7 +407,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat gcpRegionOverride: _root_.scala.Predef.String, schedulingStrategy: _root_.scala.Predef.String, bootDiskSizeGb: _root_.scala.Int, - reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] + reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity], + tensorboardResourceName: _root_.scala.Predef.String ): _root_.snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig = _root_.snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig( machineType, gpuType, @@ -386,7 +418,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat gcpRegionOverride, schedulingStrategy, bootDiskSizeGb, - reservationAffinity + reservationAffinity, + tensorboardResourceName ) // @@protoc_insertion_point(GeneratedMessageCompanion[snapchat.research.gbml.VertexAiResourceConfig]) } diff --git a/snapchat/research/gbml/gigl_resource_config_pb2.py b/snapchat/research/gbml/gigl_resource_config_pb2.py index bbda8cf57..cf55764c4 100644 --- a/snapchat/research/gbml/gigl_resource_config_pb2.py +++ b/snapchat/research/gbml/gigl_resource_config_pb2.py @@ -15,7 +15,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n1snapchat/research/gbml/gigl_resource_config.proto\x12\x16snapchat.research.gbml\"Y\n\x13SparkResourceConfig\x12\x14\n\x0cmachine_type\x18\x01 
\x01(\t\x12\x16\n\x0enum_local_ssds\x18\x02 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x03 \x01(\r\"\x83\x01\n\x16\x44\x61taflowResourceConfig\x12\x13\n\x0bnum_workers\x18\x01 \x01(\r\x12\x17\n\x0fmax_num_workers\x18\x02 \x01(\r\x12\x14\n\x0cmachine_type\x18\x03 \x01(\t\x12\x14\n\x0c\x64isk_size_gb\x18\x04 \x01(\r\x12\x0f\n\x07timeout\x18\x05 \x01(\r\"\xbc\x01\n\x16\x44\x61taPreprocessorConfig\x12P\n\x18\x65\x64ge_preprocessor_config\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfig\x12P\n\x18node_preprocessor_config\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfig\"h\n\x15VertexAiTrainerConfig\x12\x14\n\x0cmachine_type\x18\x01 \x01(\t\x12\x10\n\x08gpu_type\x18\x02 \x01(\t\x12\x11\n\tgpu_limit\x18\x03 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x04 \x01(\r\"z\n\x10KFPTrainerConfig\x12\x13\n\x0b\x63pu_request\x18\x01 \x01(\t\x12\x16\n\x0ememory_request\x18\x02 \x01(\t\x12\x10\n\x08gpu_type\x18\x03 \x01(\t\x12\x11\n\tgpu_limit\x18\x04 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x05 \x01(\r\")\n\x12LocalTrainerConfig\x12\x13\n\x0bnum_workers\x18\x01 \x01(\r\"O\n\x1bVertexAiReservationAffinity\x12\x0c\n\x04type\x18\x01 \x01(\t\x12\"\n\x1areservation_resource_names\x18\x02 \x03(\t\"\xa2\x02\n\x16VertexAiResourceConfig\x12\x14\n\x0cmachine_type\x18\x01 \x01(\t\x12\x10\n\x08gpu_type\x18\x02 \x01(\t\x12\x11\n\tgpu_limit\x18\x03 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x04 \x01(\r\x12\x0f\n\x07timeout\x18\x05 \x01(\r\x12\x1b\n\x13gcp_region_override\x18\x06 \x01(\t\x12\x1b\n\x13scheduling_strategy\x18\x07 \x01(\t\x12\x19\n\x11\x62oot_disk_size_gb\x18\x08 \x01(\r\x12Q\n\x14reservation_affinity\x18\t \x01(\x0b\x32\x33.snapchat.research.gbml.VertexAiReservationAffinity\"{\n\x11KFPResourceConfig\x12\x13\n\x0b\x63pu_request\x18\x01 \x01(\t\x12\x16\n\x0ememory_request\x18\x02 \x01(\t\x12\x10\n\x08gpu_type\x18\x03 \x01(\t\x12\x11\n\tgpu_limit\x18\x04 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x05 \x01(\r\"*\n\x13LocalResourceConfig\x12\x13\n\x0bnum_workers\x18\x01 
\x01(\r\"\xd4\x01\n\x18VertexAiGraphStoreConfig\x12H\n\x10graph_store_pool\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfig\x12\x44\n\x0c\x63ompute_pool\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfig\x12(\n compute_cluster_local_world_size\x18\x03 \x01(\x05\"\x93\x02\n\x18\x44istributedTrainerConfig\x12Q\n\x18vertex_ai_trainer_config\x18\x01 \x01(\x0b\x32-.snapchat.research.gbml.VertexAiTrainerConfigH\x00\x12\x46\n\x12kfp_trainer_config\x18\x02 \x01(\x0b\x32(.snapchat.research.gbml.KFPTrainerConfigH\x00\x12J\n\x14local_trainer_config\x18\x03 \x01(\x0b\x32*.snapchat.research.gbml.LocalTrainerConfigH\x00\x42\x10\n\x0etrainer_config\"\xf5\x02\n\x15TrainerResourceConfig\x12R\n\x18vertex_ai_trainer_config\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfigH\x00\x12G\n\x12kfp_trainer_config\x18\x02 \x01(\x0b\x32).snapchat.research.gbml.KFPResourceConfigH\x00\x12K\n\x14local_trainer_config\x18\x03 \x01(\x0b\x32+.snapchat.research.gbml.LocalResourceConfigH\x00\x12`\n$vertex_ai_graph_store_trainer_config\x18\x04 \x01(\x0b\x32\x30.snapchat.research.gbml.VertexAiGraphStoreConfigH\x00\x42\x10\n\x0etrainer_config\"\x91\x03\n\x18InferencerResourceConfig\x12U\n\x1bvertex_ai_inferencer_config\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfigH\x00\x12T\n\x1a\x64\x61taflow_inferencer_config\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfigH\x00\x12N\n\x17local_inferencer_config\x18\x03 \x01(\x0b\x32+.snapchat.research.gbml.LocalResourceConfigH\x00\x12\x63\n\'vertex_ai_graph_store_inferencer_config\x18\x04 \x01(\x0b\x32\x30.snapchat.research.gbml.VertexAiGraphStoreConfigH\x00\x42\x13\n\x11inferencer_config\"\xa3\x04\n\x14SharedResourceConfig\x12Y\n\x0fresource_labels\x18\x01 \x03(\x0b\x32@.snapchat.research.gbml.SharedResourceConfig.ResourceLabelsEntry\x12_\n\x15\x63ommon_compute_config\x18\x02 
\x01(\x0b\x32@.snapchat.research.gbml.SharedResourceConfig.CommonComputeConfig\x1a\x97\x02\n\x13\x43ommonComputeConfig\x12\x0f\n\x07project\x18\x01 \x01(\t\x12\x0e\n\x06region\x18\x02 \x01(\t\x12\x1a\n\x12temp_assets_bucket\x18\x03 \x01(\t\x12#\n\x1btemp_regional_assets_bucket\x18\x04 \x01(\t\x12\x1a\n\x12perm_assets_bucket\x18\x05 \x01(\t\x12#\n\x1btemp_assets_bq_dataset_name\x18\x06 \x01(\t\x12!\n\x19\x65mbedding_bq_dataset_name\x18\x07 \x01(\t\x12!\n\x19gcp_service_account_email\x18\x08 \x01(\t\x12\x17\n\x0f\x64\x61taflow_runner\x18\x0b \x01(\t\x1a\x35\n\x13ResourceLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\xc8\x05\n\x12GiglResourceConfig\x12$\n\x1ashared_resource_config_uri\x18\x01 \x01(\tH\x00\x12N\n\x16shared_resource_config\x18\x02 \x01(\x0b\x32,.snapchat.research.gbml.SharedResourceConfigH\x00\x12K\n\x13preprocessor_config\x18\x0c \x01(\x0b\x32..snapchat.research.gbml.DataPreprocessorConfig\x12L\n\x17subgraph_sampler_config\x18\r \x01(\x0b\x32+.snapchat.research.gbml.SparkResourceConfig\x12K\n\x16split_generator_config\x18\x0e \x01(\x0b\x32+.snapchat.research.gbml.SparkResourceConfig\x12L\n\x0etrainer_config\x18\x0f \x01(\x0b\x32\x30.snapchat.research.gbml.DistributedTrainerConfigB\x02\x18\x01\x12M\n\x11inferencer_config\x18\x10 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfigB\x02\x18\x01\x12N\n\x17trainer_resource_config\x18\x11 \x01(\x0b\x32-.snapchat.research.gbml.TrainerResourceConfig\x12T\n\x1ainferencer_resource_config\x18\x12 
\x01(\x0b\x32\x30.snapchat.research.gbml.InferencerResourceConfigB\x11\n\x0fshared_resource*\xf3\x01\n\tComponent\x12\x15\n\x11\x43omponent_Unknown\x10\x00\x12\x1e\n\x1a\x43omponent_Config_Validator\x10\x01\x12\x1e\n\x1a\x43omponent_Config_Populator\x10\x02\x12\x1f\n\x1b\x43omponent_Data_Preprocessor\x10\x03\x12\x1e\n\x1a\x43omponent_Subgraph_Sampler\x10\x04\x12\x1d\n\x19\x43omponent_Split_Generator\x10\x05\x12\x15\n\x11\x43omponent_Trainer\x10\x06\x12\x18\n\x14\x43omponent_Inferencer\x10\x07\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n1snapchat/research/gbml/gigl_resource_config.proto\x12\x16snapchat.research.gbml\"Y\n\x13SparkResourceConfig\x12\x14\n\x0cmachine_type\x18\x01 \x01(\t\x12\x16\n\x0enum_local_ssds\x18\x02 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x03 \x01(\r\"\x83\x01\n\x16\x44\x61taflowResourceConfig\x12\x13\n\x0bnum_workers\x18\x01 \x01(\r\x12\x17\n\x0fmax_num_workers\x18\x02 \x01(\r\x12\x14\n\x0cmachine_type\x18\x03 \x01(\t\x12\x14\n\x0c\x64isk_size_gb\x18\x04 \x01(\r\x12\x0f\n\x07timeout\x18\x05 \x01(\r\"\xbc\x01\n\x16\x44\x61taPreprocessorConfig\x12P\n\x18\x65\x64ge_preprocessor_config\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfig\x12P\n\x18node_preprocessor_config\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfig\"h\n\x15VertexAiTrainerConfig\x12\x14\n\x0cmachine_type\x18\x01 \x01(\t\x12\x10\n\x08gpu_type\x18\x02 \x01(\t\x12\x11\n\tgpu_limit\x18\x03 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x04 \x01(\r\"z\n\x10KFPTrainerConfig\x12\x13\n\x0b\x63pu_request\x18\x01 \x01(\t\x12\x16\n\x0ememory_request\x18\x02 \x01(\t\x12\x10\n\x08gpu_type\x18\x03 \x01(\t\x12\x11\n\tgpu_limit\x18\x04 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x05 \x01(\r\")\n\x12LocalTrainerConfig\x12\x13\n\x0bnum_workers\x18\x01 \x01(\r\"O\n\x1bVertexAiReservationAffinity\x12\x0c\n\x04type\x18\x01 \x01(\t\x12\"\n\x1areservation_resource_names\x18\x02 
\x03(\t\"\xc5\x02\n\x16VertexAiResourceConfig\x12\x14\n\x0cmachine_type\x18\x01 \x01(\t\x12\x10\n\x08gpu_type\x18\x02 \x01(\t\x12\x11\n\tgpu_limit\x18\x03 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x04 \x01(\r\x12\x0f\n\x07timeout\x18\x05 \x01(\r\x12\x1b\n\x13gcp_region_override\x18\x06 \x01(\t\x12\x1b\n\x13scheduling_strategy\x18\x07 \x01(\t\x12\x19\n\x11\x62oot_disk_size_gb\x18\x08 \x01(\r\x12Q\n\x14reservation_affinity\x18\t \x01(\x0b\x32\x33.snapchat.research.gbml.VertexAiReservationAffinity\x12!\n\x19tensorboard_resource_name\x18\n \x01(\t\"{\n\x11KFPResourceConfig\x12\x13\n\x0b\x63pu_request\x18\x01 \x01(\t\x12\x16\n\x0ememory_request\x18\x02 \x01(\t\x12\x10\n\x08gpu_type\x18\x03 \x01(\t\x12\x11\n\tgpu_limit\x18\x04 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x05 \x01(\r\"*\n\x13LocalResourceConfig\x12\x13\n\x0bnum_workers\x18\x01 \x01(\r\"\xd4\x01\n\x18VertexAiGraphStoreConfig\x12H\n\x10graph_store_pool\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfig\x12\x44\n\x0c\x63ompute_pool\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfig\x12(\n compute_cluster_local_world_size\x18\x03 \x01(\x05\"\x93\x02\n\x18\x44istributedTrainerConfig\x12Q\n\x18vertex_ai_trainer_config\x18\x01 \x01(\x0b\x32-.snapchat.research.gbml.VertexAiTrainerConfigH\x00\x12\x46\n\x12kfp_trainer_config\x18\x02 \x01(\x0b\x32(.snapchat.research.gbml.KFPTrainerConfigH\x00\x12J\n\x14local_trainer_config\x18\x03 \x01(\x0b\x32*.snapchat.research.gbml.LocalTrainerConfigH\x00\x42\x10\n\x0etrainer_config\"\xf5\x02\n\x15TrainerResourceConfig\x12R\n\x18vertex_ai_trainer_config\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfigH\x00\x12G\n\x12kfp_trainer_config\x18\x02 \x01(\x0b\x32).snapchat.research.gbml.KFPResourceConfigH\x00\x12K\n\x14local_trainer_config\x18\x03 \x01(\x0b\x32+.snapchat.research.gbml.LocalResourceConfigH\x00\x12`\n$vertex_ai_graph_store_trainer_config\x18\x04 
\x01(\x0b\x32\x30.snapchat.research.gbml.VertexAiGraphStoreConfigH\x00\x42\x10\n\x0etrainer_config\"\x91\x03\n\x18InferencerResourceConfig\x12U\n\x1bvertex_ai_inferencer_config\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfigH\x00\x12T\n\x1a\x64\x61taflow_inferencer_config\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfigH\x00\x12N\n\x17local_inferencer_config\x18\x03 \x01(\x0b\x32+.snapchat.research.gbml.LocalResourceConfigH\x00\x12\x63\n\'vertex_ai_graph_store_inferencer_config\x18\x04 \x01(\x0b\x32\x30.snapchat.research.gbml.VertexAiGraphStoreConfigH\x00\x42\x13\n\x11inferencer_config\"\xa3\x04\n\x14SharedResourceConfig\x12Y\n\x0fresource_labels\x18\x01 \x03(\x0b\x32@.snapchat.research.gbml.SharedResourceConfig.ResourceLabelsEntry\x12_\n\x15\x63ommon_compute_config\x18\x02 \x01(\x0b\x32@.snapchat.research.gbml.SharedResourceConfig.CommonComputeConfig\x1a\x97\x02\n\x13\x43ommonComputeConfig\x12\x0f\n\x07project\x18\x01 \x01(\t\x12\x0e\n\x06region\x18\x02 \x01(\t\x12\x1a\n\x12temp_assets_bucket\x18\x03 \x01(\t\x12#\n\x1btemp_regional_assets_bucket\x18\x04 \x01(\t\x12\x1a\n\x12perm_assets_bucket\x18\x05 \x01(\t\x12#\n\x1btemp_assets_bq_dataset_name\x18\x06 \x01(\t\x12!\n\x19\x65mbedding_bq_dataset_name\x18\x07 \x01(\t\x12!\n\x19gcp_service_account_email\x18\x08 \x01(\t\x12\x17\n\x0f\x64\x61taflow_runner\x18\x0b \x01(\t\x1a\x35\n\x13ResourceLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\xc8\x05\n\x12GiglResourceConfig\x12$\n\x1ashared_resource_config_uri\x18\x01 \x01(\tH\x00\x12N\n\x16shared_resource_config\x18\x02 \x01(\x0b\x32,.snapchat.research.gbml.SharedResourceConfigH\x00\x12K\n\x13preprocessor_config\x18\x0c \x01(\x0b\x32..snapchat.research.gbml.DataPreprocessorConfig\x12L\n\x17subgraph_sampler_config\x18\r \x01(\x0b\x32+.snapchat.research.gbml.SparkResourceConfig\x12K\n\x16split_generator_config\x18\x0e 
\x01(\x0b\x32+.snapchat.research.gbml.SparkResourceConfig\x12L\n\x0etrainer_config\x18\x0f \x01(\x0b\x32\x30.snapchat.research.gbml.DistributedTrainerConfigB\x02\x18\x01\x12M\n\x11inferencer_config\x18\x10 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfigB\x02\x18\x01\x12N\n\x17trainer_resource_config\x18\x11 \x01(\x0b\x32-.snapchat.research.gbml.TrainerResourceConfig\x12T\n\x1ainferencer_resource_config\x18\x12 \x01(\x0b\x32\x30.snapchat.research.gbml.InferencerResourceConfigB\x11\n\x0fshared_resource*\xf3\x01\n\tComponent\x12\x15\n\x11\x43omponent_Unknown\x10\x00\x12\x1e\n\x1a\x43omponent_Config_Validator\x10\x01\x12\x1e\n\x1a\x43omponent_Config_Populator\x10\x02\x12\x1f\n\x1b\x43omponent_Data_Preprocessor\x10\x03\x12\x1e\n\x1a\x43omponent_Subgraph_Sampler\x10\x04\x12\x1d\n\x19\x43omponent_Split_Generator\x10\x05\x12\x15\n\x11\x43omponent_Trainer\x10\x06\x12\x18\n\x14\x43omponent_Inferencer\x10\x07\x62\x06proto3') _COMPONENT = DESCRIPTOR.enum_types_by_name['Component'] Component = enum_type_wrapper.EnumTypeWrapper(_COMPONENT) @@ -184,8 +184,8 @@ _GIGLRESOURCECONFIG.fields_by_name['trainer_config']._serialized_options = b'\030\001' _GIGLRESOURCECONFIG.fields_by_name['inferencer_config']._options = None _GIGLRESOURCECONFIG.fields_by_name['inferencer_config']._serialized_options = b'\030\001' - _COMPONENT._serialized_start=3848 - _COMPONENT._serialized_end=4091 + _COMPONENT._serialized_start=3883 + _COMPONENT._serialized_end=4126 _SPARKRESOURCECONFIG._serialized_start=77 _SPARKRESOURCECONFIG._serialized_end=166 _DATAFLOWRESOURCECONFIG._serialized_start=169 @@ -201,25 +201,25 @@ _VERTEXAIRESERVATIONAFFINITY._serialized_start=766 _VERTEXAIRESERVATIONAFFINITY._serialized_end=845 _VERTEXAIRESOURCECONFIG._serialized_start=848 - _VERTEXAIRESOURCECONFIG._serialized_end=1138 - _KFPRESOURCECONFIG._serialized_start=1140 - _KFPRESOURCECONFIG._serialized_end=1263 - _LOCALRESOURCECONFIG._serialized_start=1265 - _LOCALRESOURCECONFIG._serialized_end=1307 - 
_VERTEXAIGRAPHSTORECONFIG._serialized_start=1310 - _VERTEXAIGRAPHSTORECONFIG._serialized_end=1522 - _DISTRIBUTEDTRAINERCONFIG._serialized_start=1525 - _DISTRIBUTEDTRAINERCONFIG._serialized_end=1800 - _TRAINERRESOURCECONFIG._serialized_start=1803 - _TRAINERRESOURCECONFIG._serialized_end=2176 - _INFERENCERRESOURCECONFIG._serialized_start=2179 - _INFERENCERRESOURCECONFIG._serialized_end=2580 - _SHAREDRESOURCECONFIG._serialized_start=2583 - _SHAREDRESOURCECONFIG._serialized_end=3130 - _SHAREDRESOURCECONFIG_COMMONCOMPUTECONFIG._serialized_start=2796 - _SHAREDRESOURCECONFIG_COMMONCOMPUTECONFIG._serialized_end=3075 - _SHAREDRESOURCECONFIG_RESOURCELABELSENTRY._serialized_start=3077 - _SHAREDRESOURCECONFIG_RESOURCELABELSENTRY._serialized_end=3130 - _GIGLRESOURCECONFIG._serialized_start=3133 - _GIGLRESOURCECONFIG._serialized_end=3845 + _VERTEXAIRESOURCECONFIG._serialized_end=1173 + _KFPRESOURCECONFIG._serialized_start=1175 + _KFPRESOURCECONFIG._serialized_end=1298 + _LOCALRESOURCECONFIG._serialized_start=1300 + _LOCALRESOURCECONFIG._serialized_end=1342 + _VERTEXAIGRAPHSTORECONFIG._serialized_start=1345 + _VERTEXAIGRAPHSTORECONFIG._serialized_end=1557 + _DISTRIBUTEDTRAINERCONFIG._serialized_start=1560 + _DISTRIBUTEDTRAINERCONFIG._serialized_end=1835 + _TRAINERRESOURCECONFIG._serialized_start=1838 + _TRAINERRESOURCECONFIG._serialized_end=2211 + _INFERENCERRESOURCECONFIG._serialized_start=2214 + _INFERENCERRESOURCECONFIG._serialized_end=2615 + _SHAREDRESOURCECONFIG._serialized_start=2618 + _SHAREDRESOURCECONFIG._serialized_end=3165 + _SHAREDRESOURCECONFIG_COMMONCOMPUTECONFIG._serialized_start=2831 + _SHAREDRESOURCECONFIG_COMMONCOMPUTECONFIG._serialized_end=3110 + _SHAREDRESOURCECONFIG_RESOURCELABELSENTRY._serialized_start=3112 + _SHAREDRESOURCECONFIG_RESOURCELABELSENTRY._serialized_end=3165 + _GIGLRESOURCECONFIG._serialized_start=3168 + _GIGLRESOURCECONFIG._serialized_end=3880 # @@protoc_insertion_point(module_scope) diff --git 
a/snapchat/research/gbml/gigl_resource_config_pb2.pyi b/snapchat/research/gbml/gigl_resource_config_pb2.pyi index 6198d1076..8522294a9 100644 --- a/snapchat/research/gbml/gigl_resource_config_pb2.pyi +++ b/snapchat/research/gbml/gigl_resource_config_pb2.pyi @@ -259,6 +259,7 @@ class VertexAiResourceConfig(google.protobuf.message.Message): SCHEDULING_STRATEGY_FIELD_NUMBER: builtins.int BOOT_DISK_SIZE_GB_FIELD_NUMBER: builtins.int RESERVATION_AFFINITY_FIELD_NUMBER: builtins.int + TENSORBOARD_RESOURCE_NAME_FIELD_NUMBER: builtins.int machine_type: builtins.str """Machine type for job""" gpu_type: builtins.str @@ -294,6 +295,10 @@ class VertexAiResourceConfig(google.protobuf.message.Message): """Compute Engine reservation affinity for the job. See https://docs.cloud.google.com/vertex-ai/docs/training/use-reservations """ + tensorboard_resource_name: builtins.str + """Existing Vertex AI TensorBoard resource to attach to the job. + Format: projects/{project}/locations/{region}/tensorboards/{tensorboard_id} + """ def __init__( self, *, @@ -306,9 +311,10 @@ class VertexAiResourceConfig(google.protobuf.message.Message): scheduling_strategy: builtins.str = ..., boot_disk_size_gb: builtins.int = ..., reservation_affinity: global___VertexAiReservationAffinity | None = ..., + tensorboard_resource_name: builtins.str = ..., ) -> None: ... def HasField(self, field_name: typing_extensions.Literal["reservation_affinity", b"reservation_affinity"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["boot_disk_size_gb", b"boot_disk_size_gb", "gcp_region_override", b"gcp_region_override", "gpu_limit", b"gpu_limit", "gpu_type", b"gpu_type", "machine_type", b"machine_type", "num_replicas", b"num_replicas", "reservation_affinity", b"reservation_affinity", "scheduling_strategy", b"scheduling_strategy", "timeout", b"timeout"]) -> None: ... 
+ def ClearField(self, field_name: typing_extensions.Literal["boot_disk_size_gb", b"boot_disk_size_gb", "gcp_region_override", b"gcp_region_override", "gpu_limit", b"gpu_limit", "gpu_type", b"gpu_type", "machine_type", b"machine_type", "num_replicas", b"num_replicas", "reservation_affinity", b"reservation_affinity", "scheduling_strategy", b"scheduling_strategy", "tensorboard_resource_name", b"tensorboard_resource_name", "timeout", b"timeout"]) -> None: ... global___VertexAiResourceConfig = VertexAiResourceConfig From 5edfeddf5988aee91d77d28524bbdd45de51cefb Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 28 Apr 2026 18:55:58 +0000 Subject: [PATCH 02/59] Enable Vertex AI TensorBoard for trainer jobs --- examples/link_prediction/README.md | 11 +++ .../configs/e2e_het_dblp_sup_task_config.yaml | 1 + .../configs/e2e_hom_cora_sup_task_config.yaml | 1 + .../configs/example_resource_config.yaml | 1 + .../e2e_het_dblp_sup_gs_task_config.yaml | 1 + .../e2e_hom_cora_sup_gs_task_config.yaml | 1 + .../configs/example_resource_config.yaml | 1 + .../graph_store/heterogeneous_training.py | 62 +++++++++++- .../graph_store/homogeneous_training.py | 62 +++++++++++- .../link_prediction/heterogeneous_training.py | 63 ++++++++++++- .../link_prediction/homogeneous_training.py | 63 ++++++++++++- gigl/common/services/vertex_ai.py | 9 ++ gigl/src/common/constants/gcs.py | 2 +- gigl/src/common/utils/tensorboard.py | 94 +++++++++++++++++++ gigl/src/common/vertex_ai_launcher.py | 66 +++++++++++-- gigl/src/training/v1/lib/training_process.py | 17 +++- gigl/src/training/v1/trainer.py | 65 ++++++------- gigl/src/training/v2/glt_trainer.py | 11 +++ gigl/src/validation_check/config_validator.py | 6 ++ ...nd_resource_config_compatibility_checks.py | 40 ++++++++ .../unit/src/common/utils/tensorboard_test.py | 93 ++++++++++++++++++ .../src/common/vertex_ai_launcher_test.py | 21 +++++ tests/unit/src/common/vertex_ai_test.py | 65 +++++++++++++ .../config_populator_functionality_test.py | 2 + 
...source_config_compatibility_checks_test.py | 76 +++++++++++++++ 25 files changed, 767 insertions(+), 67 deletions(-) create mode 100644 gigl/src/common/utils/tensorboard.py create mode 100644 tests/unit/src/common/utils/tensorboard_test.py create mode 100644 tests/unit/src/common/vertex_ai_test.py diff --git a/examples/link_prediction/README.md b/examples/link_prediction/README.md index cd730f595..f9f557caf 100644 --- a/examples/link_prediction/README.md +++ b/examples/link_prediction/README.md @@ -23,6 +23,17 @@ are example inference and training loops for the DBLP dataset. The DBLP dataset You can follow along with [dblp.ipynb](./dblp.ipynb) to run an e2e GiGL pipeline on the DBLP dataset. It will guide you through running each component: `config_populator` -> `data_preprocessor` -> `trainer` -> `inferencer` +## Vertex AI TensorBoard + +The example trainer configs enable TensorBoard logging with +`trainerConfig.shouldLogToTensorboard: true`. + +To surface those events in Vertex AI TensorBoard, set +`tensorboard_resource_name` on the trainer Vertex resource config, use a +regional bucket, and keep the bucket, CustomJob, and TensorBoard instance in +the same region. The attached service account should have +`roles/storage.admin` and `roles/aiplatform.user`. 
+ ```{toctree} :maxdepth: 2 :hidden: diff --git a/examples/link_prediction/configs/e2e_het_dblp_sup_task_config.yaml b/examples/link_prediction/configs/e2e_het_dblp_sup_task_config.yaml index 8531fd081..3d4024c79 100644 --- a/examples/link_prediction/configs/e2e_het_dblp_sup_task_config.yaml +++ b/examples/link_prediction/configs/e2e_het_dblp_sup_task_config.yaml @@ -30,6 +30,7 @@ datasetConfig: # This argument is specific for the `PassthroughPreprocessorConfigForMockedAssets` preprocessor to indicate which dataset we should be using mocked_dataset_name: 'dblp_node_anchor_edge_features_lp' trainerConfig: + shouldLogToTensorboard: true trainerArgs: # Example argument to trainer log_every_n_batch: "50" diff --git a/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml b/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml index 606f13c29..845e7a9c8 100644 --- a/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml +++ b/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml @@ -13,6 +13,7 @@ datasetConfig: # This argument is specific for the `PassthroughPreprocessorConfigForMockedAssets` preprocessor to indicate which dataset we should be using mocked_dataset_name: 'cora_homogeneous_node_anchor_edge_features_user_defined_labels' trainerConfig: + shouldLogToTensorboard: true trainerArgs: # Example argument to trainer log_every_n_batch: "50" # Frequency in which we log batch information diff --git a/examples/link_prediction/configs/example_resource_config.yaml b/examples/link_prediction/configs/example_resource_config.yaml index 2b7d7a02a..b24557f9a 100644 --- a/examples/link_prediction/configs/example_resource_config.yaml +++ b/examples/link_prediction/configs/example_resource_config.yaml @@ -43,6 +43,7 @@ trainer_resource_config: gpu_type: NVIDIA_TESLA_T4 gpu_limit: 2 num_replicas: 2 + tensorboard_resource_name: "projects/USER_PROVIDED_PROJECT/locations/us-central1/tensorboards/USER_PROVIDED_TENSORBOARD_ID" 
inferencer_resource_config: vertex_ai_inferencer_config: machine_type: n1-standard-16 diff --git a/examples/link_prediction/graph_store/configs/e2e_het_dblp_sup_gs_task_config.yaml b/examples/link_prediction/graph_store/configs/e2e_het_dblp_sup_gs_task_config.yaml index 7c23186c7..36fc48ea6 100644 --- a/examples/link_prediction/graph_store/configs/e2e_het_dblp_sup_gs_task_config.yaml +++ b/examples/link_prediction/graph_store/configs/e2e_het_dblp_sup_gs_task_config.yaml @@ -30,6 +30,7 @@ datasetConfig: # This argument is specific for the `PassthroughPreprocessorConfigForMockedAssets` preprocessor to indicate which dataset we should be using mocked_dataset_name: 'dblp_node_anchor_edge_features_lp' trainerConfig: + shouldLogToTensorboard: true trainerArgs: # Example argument to trainer log_every_n_batch: "50" diff --git a/examples/link_prediction/graph_store/configs/e2e_hom_cora_sup_gs_task_config.yaml b/examples/link_prediction/graph_store/configs/e2e_hom_cora_sup_gs_task_config.yaml index 2283a2f91..faf4316b7 100644 --- a/examples/link_prediction/graph_store/configs/e2e_hom_cora_sup_gs_task_config.yaml +++ b/examples/link_prediction/graph_store/configs/e2e_hom_cora_sup_gs_task_config.yaml @@ -16,6 +16,7 @@ datasetConfig: # This argument is specific for the `PassthroughPreprocessorConfigForMockedAssets` preprocessor to indicate which dataset we should be using mocked_dataset_name: 'cora_homogeneous_node_anchor_edge_features_user_defined_labels' trainerConfig: + shouldLogToTensorboard: true trainerArgs: # Example argument to trainer log_every_n_batch: "50" # Frequency in which we log batch information diff --git a/examples/link_prediction/graph_store/configs/example_resource_config.yaml b/examples/link_prediction/graph_store/configs/example_resource_config.yaml index 869f627ca..a06f3192a 100644 --- a/examples/link_prediction/graph_store/configs/example_resource_config.yaml +++ b/examples/link_prediction/graph_store/configs/example_resource_config.yaml @@ -58,6 +58,7 
@@ trainer_resource_config: gpu_type: NVIDIA_TESLA_T4 gpu_limit: 2 num_replicas: 2 + tensorboard_resource_name: "projects/USER_PROVIDED_PROJECT/locations/us-central1/tensorboards/USER_PROVIDED_TENSORBOARD_ID" inferencer_resource_config: vertex_ai_graph_store_inferencer_config: graph_store_pool: diff --git a/examples/link_prediction/graph_store/heterogeneous_training.py b/examples/link_prediction/graph_store/heterogeneous_training.py index ec42cf45a..bbfeb018d 100644 --- a/examples/link_prediction/graph_store/heterogeneous_training.py +++ b/examples/link_prediction/graph_store/heterogeneous_training.py @@ -113,6 +113,11 @@ ) from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict +from gigl.src.common.utils.tensorboard import ( + close_tensorboard_writer, + create_tensorboard_writer, + write_tensorboard_scalar, +) from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout @@ -372,6 +377,7 @@ class TrainingProcessArgs: sharing between local processes. supervision_edge_type (EdgeType): The supervision edge type for training. model_uri (Uri): URI to save/load the trained model state dict. + tensorboard_log_uri (Optional[Uri]): Destination URI for TensorBoard logs. hid_dim (int): Hidden dimension of the model. out_dim (int): Output dimension of the model. node_type_to_feature_dim (dict[NodeType, int]): Mapping of node types to their feature dimensions. @@ -388,6 +394,7 @@ class TrainingProcessArgs: num_val_batches (int): Number of validation batches across all processes. val_every_n_batch (int): Frequency to run validation during training. log_every_n_batch (int): Frequency to log batch information during training. + should_log_to_tensorboard (bool): If True, emit TensorBoard summaries. should_skip_training (bool): If True, skip training and only run testing. 
""" @@ -401,6 +408,7 @@ class TrainingProcessArgs: # Model model_uri: Uri eval_metrics_uri: Optional[Uri] + tensorboard_log_uri: Optional[Uri] hid_dim: int out_dim: int node_type_to_feature_dim: dict[NodeType, int] @@ -421,6 +429,7 @@ class TrainingProcessArgs: num_val_batches: int val_every_n_batch: int log_every_n_batch: int + should_log_to_tensorboard: bool should_skip_training: bool @@ -459,12 +468,18 @@ def _training_process( if torch.cuda.is_available(): torch.cuda.set_device(device) print(f"---Rank {rank} training process set device {device}") + tensorboard_writer = create_tensorboard_writer( + should_log_to_tensorboard=args.should_log_to_tensorboard, + configured_tensorboard_log_uri=args.tensorboard_log_uri, + should_write_events=rank == 0, + ) loss_fn = RetrievalLoss( loss=torch.nn.CrossEntropyLoss(reduction="mean"), temperature=0.07, remove_accidental_hits=True, ) + batch_idx = 0 if not args.should_skip_training: train_main_loader, train_random_negative_loader = _setup_dataloaders( @@ -525,7 +540,6 @@ def _training_process( # Entering the training loop training_start_time = time.time() - batch_idx = 0 avg_train_loss = 0.0 last_n_batch_avg_loss: list[float] = [] last_n_batch_time: list[float] = [] @@ -567,17 +581,31 @@ def _training_process( if ( batch_idx % args.log_every_n_batch == 0 or batch_idx < 10 ): # Log the first 10 batches to ensure the model is initialized correctly + mean_batch_time = statistics.mean(last_n_batch_time) + mean_train_loss = statistics.mean(last_n_batch_avg_loss) print( f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" ) if torch.cuda.is_available(): torch.cuda.synchronize() print( - f"rank={rank}, batch={batch_idx}, mean(batch_time)={statistics.mean(last_n_batch_time):.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + f"rank={rank}, batch={batch_idx}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, 
min(batch_time)={min(last_n_batch_time):.3f} sec" + ) + write_tensorboard_scalar( + writer=tensorboard_writer, + tag="Time/batch_mean_sec", + value=mean_batch_time, + step=batch_idx, ) last_n_batch_time.clear() print( - f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={statistics.mean(last_n_batch_avg_loss):.6f}" + f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" + ) + write_tensorboard_scalar( + writer=tensorboard_writer, + tag="Loss/train", + value=mean_train_loss, + step=batch_idx, ) last_n_batch_avg_loss.clear() flush() @@ -585,7 +613,7 @@ def _training_process( if batch_idx % args.val_every_n_batch == 0: print(f"rank={rank}, batch={batch_idx}, validating...") model.eval() - _run_validation_loops( + global_avg_val_loss = _run_validation_loops( model=model, main_loader=val_main_loader_iter, random_negative_loader=val_random_negative_loader_iter, @@ -596,6 +624,12 @@ def _training_process( log_every_n_batch=args.log_every_n_batch, num_batches=num_val_batches_per_process, ) + write_tensorboard_scalar( + writer=tensorboard_writer, + tag="Loss/val", + value=global_avg_val_loss, + step=batch_idx, + ) model.train() else: print(f"rank={rank} ended training early - no break condition was met") @@ -674,6 +708,12 @@ def _training_process( device=device, log_every_n_batch=args.log_every_n_batch, ) + write_tensorboard_scalar( + writer=tensorboard_writer, + tag="Loss/test", + value=global_avg_test_loss, + step=batch_idx, + ) # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -701,6 +741,7 @@ def _training_process( f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" ) flush() + close_tensorboard_writer(tensorboard_writer) # Graph store mode cleanup: shutdown the compute process connection to the storage cluster. 
shutdown_compute_proccess() @@ -926,7 +967,18 @@ def _run_example_training( eval_metrics_uri: Optional[Uri] = ( UriFactory.create_uri(raw_eval_metrics_uri) if raw_eval_metrics_uri else None ) + raw_tensorboard_log_uri = ( + gbml_config_pb_wrapper.gbml_config_pb.shared_config.trained_model_metadata.tensorboard_logs_uri + ) + tensorboard_log_uri: Optional[Uri] = ( + UriFactory.create_uri(raw_tensorboard_log_uri) + if raw_tensorboard_log_uri + else None + ) + should_log_to_tensorboard = ( + gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard + ) should_skip_training = gbml_config_pb_wrapper.shared_config.should_skip_training supervision_edge_types = ( @@ -949,6 +1001,7 @@ def _run_example_training( supervision_edge_type=supervision_edge_type, model_uri=model_uri, eval_metrics_uri=eval_metrics_uri, + tensorboard_log_uri=tensorboard_log_uri, hid_dim=hid_dim, out_dim=out_dim, node_type_to_feature_dim=node_type_to_feature_dim, @@ -965,6 +1018,7 @@ def _run_example_training( num_val_batches=num_val_batches, val_every_n_batch=val_every_n_batch, log_every_n_batch=log_every_n_batch, + should_log_to_tensorboard=should_log_to_tensorboard, should_skip_training=should_skip_training, ) diff --git a/examples/link_prediction/graph_store/homogeneous_training.py b/examples/link_prediction/graph_store/homogeneous_training.py index e972edac2..ec1ad7b42 100644 --- a/examples/link_prediction/graph_store/homogeneous_training.py +++ b/examples/link_prediction/graph_store/homogeneous_training.py @@ -157,6 +157,11 @@ ) from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict +from gigl.src.common.utils.tensorboard import ( + close_tensorboard_writer, + create_tensorboard_writer, + write_tensorboard_scalar, +) from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout @@ -367,6 +372,7 @@ class TrainingProcessArgs: model_uri (Uri): URI to 
save/load the trained model state dict. eval_metrics_uri (Optional[Uri]): Destination URI for writing evaluation metrics in KFP-compatible JSON format. If None, metrics are not written. + tensorboard_log_uri (Optional[Uri]): Destination URI for TensorBoard logs. hid_dim (int): Hidden dimension of the model. out_dim (int): Output dimension of the model. node_feature_dim (int): Input node feature dimension for the model. @@ -383,6 +389,7 @@ class TrainingProcessArgs: num_val_batches (int): Number of validation batches across all processes. val_every_n_batch (int): Frequency to run validation during training. log_every_n_batch (int): Frequency to log batch information during training. + should_log_to_tensorboard (bool): If True, emit TensorBoard summaries. should_skip_training (bool): If True, skip training and only run testing. """ @@ -393,6 +400,7 @@ class TrainingProcessArgs: # Model model_uri: Uri eval_metrics_uri: Optional[Uri] + tensorboard_log_uri: Optional[Uri] hid_dim: int out_dim: int node_feature_dim: int @@ -413,6 +421,7 @@ class TrainingProcessArgs: num_val_batches: int val_every_n_batch: int log_every_n_batch: int + should_log_to_tensorboard: bool should_skip_training: bool @@ -450,12 +459,18 @@ def _training_process( if torch.cuda.is_available(): torch.cuda.set_device(device) logger.info(f"---Rank {rank} training process set device {device}") + tensorboard_writer = create_tensorboard_writer( + should_log_to_tensorboard=args.should_log_to_tensorboard, + configured_tensorboard_log_uri=args.tensorboard_log_uri, + should_write_events=rank == 0, + ) loss_fn = RetrievalLoss( loss=torch.nn.CrossEntropyLoss(reduction="mean"), temperature=0.07, remove_accidental_hits=True, ) + batch_idx = 0 if not args.should_skip_training: train_main_loader, train_random_negative_loader = _setup_dataloaders( @@ -517,7 +532,6 @@ def _training_process( # Entering the training loop training_start_time = time.time() - batch_idx = 0 avg_train_loss = 0.0 last_n_batch_avg_loss: 
list[float] = [] last_n_batch_time: list[float] = [] @@ -555,17 +569,31 @@ def _training_process( batch_start = time.time() batch_idx += 1 if batch_idx % args.log_every_n_batch == 0: + mean_batch_time = statistics.mean(last_n_batch_time) + mean_train_loss = statistics.mean(last_n_batch_avg_loss) logger.info( f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" ) if torch.cuda.is_available(): torch.cuda.synchronize() logger.info( - f"rank={rank}, mean(batch_time)={statistics.mean(last_n_batch_time):.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + f"rank={rank}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + ) + write_tensorboard_scalar( + writer=tensorboard_writer, + tag="Time/batch_mean_sec", + value=mean_batch_time, + step=batch_idx, ) last_n_batch_time.clear() logger.info( - f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={statistics.mean(last_n_batch_avg_loss):.6f}" + f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" + ) + write_tensorboard_scalar( + writer=tensorboard_writer, + tag="Loss/train", + value=mean_train_loss, + step=batch_idx, ) last_n_batch_avg_loss.clear() flush() @@ -573,7 +601,7 @@ def _training_process( if batch_idx % args.val_every_n_batch == 0: logger.info(f"rank={rank}, batch={batch_idx}, validating...") model.eval() - _run_validation_loops( + global_avg_val_loss = _run_validation_loops( model=model, main_loader=val_main_loader_iter, random_negative_loader=val_random_negative_loader_iter, @@ -582,6 +610,12 @@ def _training_process( log_every_n_batch=args.log_every_n_batch, num_batches=num_val_batches_per_process, ) + write_tensorboard_scalar( + writer=tensorboard_writer, + tag="Loss/val", + value=global_avg_val_loss, + 
step=batch_idx, + ) model.train() logger.info(f"---Rank {rank} finished training") @@ -657,6 +691,12 @@ def _training_process( device=device, log_every_n_batch=args.log_every_n_batch, ) + write_tensorboard_scalar( + writer=tensorboard_writer, + tag="Loss/test", + value=global_avg_test_loss, + step=batch_idx, + ) # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -684,6 +724,7 @@ def _training_process( f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" ) flush() + close_tensorboard_writer(tensorboard_writer) # Graph store mode cleanup: shutdown the compute process connection to the storage cluster. shutdown_compute_proccess() @@ -913,7 +954,18 @@ def _run_example_training( eval_metrics_uri: Optional[Uri] = ( UriFactory.create_uri(raw_eval_metrics_uri) if raw_eval_metrics_uri else None ) + raw_tensorboard_log_uri = ( + gbml_config_pb_wrapper.gbml_config_pb.shared_config.trained_model_metadata.tensorboard_logs_uri + ) + tensorboard_log_uri: Optional[Uri] = ( + UriFactory.create_uri(raw_tensorboard_log_uri) + if raw_tensorboard_log_uri + else None + ) + should_log_to_tensorboard = ( + gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard + ) should_skip_training = gbml_config_pb_wrapper.shared_config.should_skip_training # Step 4: Spawn training processes @@ -926,6 +978,7 @@ def _run_example_training( cluster_info=cluster_info, model_uri=model_uri, eval_metrics_uri=eval_metrics_uri, + tensorboard_log_uri=tensorboard_log_uri, hid_dim=hid_dim, out_dim=out_dim, node_feature_dim=node_feature_dim, @@ -942,6 +995,7 @@ def _run_example_training( num_val_batches=num_val_batches, val_every_n_batch=val_every_n_batch, log_every_n_batch=log_every_n_batch, + should_log_to_tensorboard=should_log_to_tensorboard, should_skip_training=should_skip_training, ) diff --git a/examples/link_prediction/heterogeneous_training.py b/examples/link_prediction/heterogeneous_training.py index f0d58ca5e..bc43770c5 
100644 --- a/examples/link_prediction/heterogeneous_training.py +++ b/examples/link_prediction/heterogeneous_training.py @@ -63,6 +63,11 @@ ) from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict +from gigl.src.common.utils.tensorboard import ( + close_tensorboard_writer, + create_tensorboard_writer, + write_tensorboard_scalar, +) from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout @@ -308,6 +313,7 @@ class TrainingProcessArgs: model_uri (Uri): URI to save/load the trained model state dict. eval_metrics_uri (Optional[Uri]): Destination URI for writing evaluation metrics in KFP-compatible JSON format. If None, metrics are not written. + tensorboard_log_uri (Optional[Uri]): Destination URI for TensorBoard logs. hid_dim (int): Hidden dimension of the model. out_dim (int): Output dimension of the model. node_type_to_feature_dim (dict[NodeType, int]): Mapping of node types to their feature @@ -329,6 +335,7 @@ class TrainingProcessArgs: num_val_batches (int): Number of validation batches across all processes. val_every_n_batch (int): Frequency to run validation during training. log_every_n_batch (int): Frequency to log batch information during training. + should_log_to_tensorboard (bool): If True, emit TensorBoard summaries. should_skip_training (bool): If True, skip training and only run testing. 
""" @@ -346,6 +353,7 @@ class TrainingProcessArgs: # Model model_uri: Uri eval_metrics_uri: Optional[Uri] + tensorboard_log_uri: Optional[Uri] hid_dim: int out_dim: int node_type_to_feature_dim: dict[NodeType, int] @@ -366,6 +374,7 @@ class TrainingProcessArgs: num_val_batches: int val_every_n_batch: int log_every_n_batch: int + should_log_to_tensorboard: bool should_skip_training: bool @@ -400,11 +409,18 @@ def _training_process( if torch.cuda.is_available(): torch.cuda.set_device(device) logger.info(f"---Rank {rank} training process set device {device}") + is_chief_process = args.machine_rank == 0 and local_rank == 0 + tensorboard_writer = create_tensorboard_writer( + should_log_to_tensorboard=args.should_log_to_tensorboard, + configured_tensorboard_log_uri=args.tensorboard_log_uri, + should_write_events=is_chief_process, + ) loss_fn = RetrievalLoss( loss=torch.nn.CrossEntropyLoss(reduction="mean"), temperature=0.07, remove_accidental_hits=True, ) + batch_idx = 0 if not args.should_skip_training: train_main_loader, train_random_negative_loader = _setup_dataloaders( @@ -469,7 +485,6 @@ def _training_process( # Entering the training loop training_start_time = time.time() - batch_idx = 0 avg_train_loss = 0.0 last_n_batch_avg_loss: list[float] = [] last_n_batch_time: list[float] = [] @@ -509,6 +524,8 @@ def _training_process( batch_start = time.time() batch_idx += 1 if batch_idx % args.log_every_n_batch == 0: + mean_batch_time = statistics.mean(last_n_batch_time) + mean_train_loss = statistics.mean(last_n_batch_avg_loss) logger.info( f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" ) @@ -516,19 +533,31 @@ def _training_process( # Wait for GPU operations to finish torch.cuda.synchronize() logger.info( - f"rank={rank}, batch={batch_idx}, mean(batch_time)={statistics.mean(last_n_batch_time):.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + f"rank={rank}, batch={batch_idx}, 
mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + ) + write_tensorboard_scalar( + writer=tensorboard_writer, + tag="Time/batch_mean_sec", + value=mean_batch_time, + step=batch_idx, ) last_n_batch_time.clear() # log the global average training loss logger.info( - f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={statistics.mean(last_n_batch_avg_loss):.6f}" + f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" + ) + write_tensorboard_scalar( + writer=tensorboard_writer, + tag="Loss/train", + value=mean_train_loss, + step=batch_idx, ) last_n_batch_avg_loss.clear() if batch_idx % args.val_every_n_batch == 0: logger.info(f"rank={rank}, batch={batch_idx}, validating...") model.eval() - _run_validation_loops( + global_avg_val_loss = _run_validation_loops( model=model, main_loader=val_main_loader_iter, random_negative_loader=val_random_negative_loader_iter, @@ -538,6 +567,12 @@ def _training_process( log_every_n_batch=args.log_every_n_batch, num_batches=num_val_batches_per_process, ) + write_tensorboard_scalar( + writer=tensorboard_writer, + tag="Loss/val", + value=global_avg_val_loss, + step=batch_idx, + ) model.train() logger.info(f"---Rank {rank} finished training") @@ -619,6 +654,12 @@ def _training_process( device=device, log_every_n_batch=args.log_every_n_batch, ) + write_tensorboard_scalar( + writer=tensorboard_writer, + tag="Loss/test", + value=global_avg_test_loss, + step=batch_idx, + ) # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -648,6 +689,7 @@ def _training_process( logger.info( f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" ) + close_tensorboard_writer(tensorboard_writer) torch.distributed.destroy_process_group() @@ -881,7 +923,18 @@ def 
_run_example_training( eval_metrics_uri: Optional[Uri] = ( UriFactory.create_uri(raw_eval_metrics_uri) if raw_eval_metrics_uri else None ) + raw_tensorboard_log_uri = ( + gbml_config_pb_wrapper.gbml_config_pb.shared_config.trained_model_metadata.tensorboard_logs_uri + ) + tensorboard_log_uri: Optional[Uri] = ( + UriFactory.create_uri(raw_tensorboard_log_uri) + if raw_tensorboard_log_uri + else None + ) + should_log_to_tensorboard = ( + gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard + ) should_skip_training = gbml_config_pb_wrapper.shared_config.should_skip_training supervision_edge_types = ( @@ -906,6 +959,7 @@ def _run_example_training( supervision_edge_type=supervision_edge_type, model_uri=model_uri, eval_metrics_uri=eval_metrics_uri, + tensorboard_log_uri=tensorboard_log_uri, hid_dim=hid_dim, out_dim=out_dim, node_type_to_feature_dim=node_type_to_feature_dim, @@ -922,6 +976,7 @@ def _run_example_training( num_val_batches=num_val_batches, val_every_n_batch=val_every_n_batch, log_every_n_batch=log_every_n_batch, + should_log_to_tensorboard=should_log_to_tensorboard, should_skip_training=should_skip_training, ) diff --git a/examples/link_prediction/homogeneous_training.py b/examples/link_prediction/homogeneous_training.py index b95a77489..adfe8dff3 100644 --- a/examples/link_prediction/homogeneous_training.py +++ b/examples/link_prediction/homogeneous_training.py @@ -58,6 +58,11 @@ ) from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict +from gigl.src.common.utils.tensorboard import ( + close_tensorboard_writer, + create_tensorboard_writer, + write_tensorboard_scalar, +) from gigl.types.graph import to_homogeneous from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout @@ -269,6 +274,7 @@ class TrainingProcessArgs: model_uri (Uri): URI to save/load the trained model state dict. 
eval_metrics_uri (Optional[Uri]): Destination URI for writing evaluation metrics in KFP-compatible JSON format. If None, metrics are not written. + tensorboard_log_uri (Optional[Uri]): Destination URI for TensorBoard logs. hid_dim (int): Hidden dimension of the model. out_dim (int): Output dimension of the model. node_feature_dim (int): Input node feature dimension for the model. @@ -288,6 +294,7 @@ class TrainingProcessArgs: num_val_batches (int): Number of validation batches across all processes. val_every_n_batch (int): Frequency to run validation during training. log_every_n_batch (int): Frequency to log batch information during training. + should_log_to_tensorboard (bool): If True, emit TensorBoard summaries. should_skip_training (bool): If True, skip training and only run testing. """ @@ -304,6 +311,7 @@ class TrainingProcessArgs: # Model model_uri: Uri eval_metrics_uri: Optional[Uri] + tensorboard_log_uri: Optional[Uri] hid_dim: int out_dim: int node_feature_dim: int @@ -324,6 +332,7 @@ class TrainingProcessArgs: num_val_batches: int val_every_n_batch: int log_every_n_batch: int + should_log_to_tensorboard: bool should_skip_training: bool @@ -359,12 +368,19 @@ def _training_process( logger.info(f"---Rank {rank} training process set device {device}") logger.info(f"---Rank {rank} training process group initialized") + is_chief_process = args.machine_rank == 0 and local_rank == 0 + tensorboard_writer = create_tensorboard_writer( + should_log_to_tensorboard=args.should_log_to_tensorboard, + configured_tensorboard_log_uri=args.tensorboard_log_uri, + should_write_events=is_chief_process, + ) loss_fn = RetrievalLoss( loss=torch.nn.CrossEntropyLoss(reduction="mean"), temperature=0.07, remove_accidental_hits=True, ) + batch_idx = 0 if not args.should_skip_training: train_main_loader, train_random_negative_loader = _setup_dataloaders( @@ -429,7 +445,6 @@ def _training_process( # Entering the training loop training_start_time = time.time() - batch_idx = 0 
avg_train_loss = 0.0 last_n_batch_avg_loss: list[float] = [] last_n_batch_time: list[float] = [] @@ -468,6 +483,8 @@ def _training_process( batch_start = time.time() batch_idx += 1 if batch_idx % args.log_every_n_batch == 0: + mean_batch_time = statistics.mean(last_n_batch_time) + mean_train_loss = statistics.mean(last_n_batch_avg_loss) logger.info( f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" ) @@ -475,19 +492,31 @@ def _training_process( # Wait for GPU operations to finish torch.cuda.synchronize() logger.info( - f"rank={rank}, mean(batch_time)={statistics.mean(last_n_batch_time):.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + f"rank={rank}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + ) + write_tensorboard_scalar( + writer=tensorboard_writer, + tag="Time/batch_mean_sec", + value=mean_batch_time, + step=batch_idx, ) last_n_batch_time.clear() # log the global average training loss logger.info( - f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={statistics.mean(last_n_batch_avg_loss):.6f}" + f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" + ) + write_tensorboard_scalar( + writer=tensorboard_writer, + tag="Loss/train", + value=mean_train_loss, + step=batch_idx, ) last_n_batch_avg_loss.clear() if batch_idx % args.val_every_n_batch == 0: logger.info(f"rank={rank}, batch={batch_idx}, validating...") model.eval() - _run_validation_loops( + global_avg_val_loss = _run_validation_loops( model=model, main_loader=val_main_loader_iter, random_negative_loader=val_random_negative_loader_iter, @@ -496,6 +525,12 @@ def _training_process( log_every_n_batch=args.log_every_n_batch, num_batches=num_val_batches_per_process, ) + write_tensorboard_scalar( + 
writer=tensorboard_writer, + tag="Loss/val", + value=global_avg_val_loss, + step=batch_idx, + ) model.train() logger.info(f"---Rank {rank} finished training") @@ -573,6 +608,12 @@ def _training_process( device=device, log_every_n_batch=args.log_every_n_batch, ) + write_tensorboard_scalar( + writer=tensorboard_writer, + tag="Loss/test", + value=global_avg_test_loss, + step=batch_idx, + ) # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -602,6 +643,7 @@ def _training_process( logger.info( f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" ) + close_tensorboard_writer(tensorboard_writer) torch.distributed.destroy_process_group() @@ -817,7 +859,18 @@ def _run_example_training( eval_metrics_uri: Optional[Uri] = ( UriFactory.create_uri(raw_eval_metrics_uri) if raw_eval_metrics_uri else None ) + raw_tensorboard_log_uri = ( + gbml_config_pb_wrapper.gbml_config_pb.shared_config.trained_model_metadata.tensorboard_logs_uri + ) + tensorboard_log_uri: Optional[Uri] = ( + UriFactory.create_uri(raw_tensorboard_log_uri) + if raw_tensorboard_log_uri + else None + ) + should_log_to_tensorboard = ( + gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard + ) should_skip_training = gbml_config_pb_wrapper.shared_config.should_skip_training logger.info("--- Launching training processes ...\n") @@ -832,6 +885,7 @@ def _run_example_training( dataset=dataset, model_uri=model_uri, eval_metrics_uri=eval_metrics_uri, + tensorboard_log_uri=tensorboard_log_uri, hid_dim=hid_dim, out_dim=out_dim, node_feature_dim=node_feature_dim, @@ -848,6 +902,7 @@ def _run_example_training( num_val_batches=num_val_batches, val_every_n_batch=val_every_n_batch, log_every_n_batch=log_every_n_batch, + should_log_to_tensorboard=should_log_to_tensorboard, should_skip_training=should_skip_training, ) diff --git a/gigl/common/services/vertex_ai.py b/gigl/common/services/vertex_ai.py index 6cbb968b3..4b1582aeb 100644 --- 
a/gigl/common/services/vertex_ai.py +++ b/gigl/common/services/vertex_ai.py @@ -135,6 +135,11 @@ class VertexAiJobConfig: reservation_affinity: Optional ``ReservationAffinity`` that maps to ``MachineSpec.reservation_affinity``. ``None`` uses the Vertex AI default (no reservation). + base_output_dir: Optional CustomJob base output directory. When set, + Vertex AI derives ``AIP_MODEL_DIR``, ``AIP_CHECKPOINT_DIR``, and + ``AIP_TENSORBOARD_LOG_DIR`` from this directory. + tensorboard_resource_name: Optional existing Vertex AI TensorBoard + resource to attach to the job. """ job_name: str @@ -153,6 +158,8 @@ class VertexAiJobConfig: enable_web_access: bool = True scheduling_strategy: Optional[aiplatform.gapic.Scheduling.Strategy] = None reservation_affinity: Optional[ReservationAffinity] = None + base_output_dir: Optional[str] = None + tensorboard_resource_name: Optional[str] = None class VertexAIService: @@ -347,12 +354,14 @@ def _submit_job( location=self._location, labels=job_config.labels, staging_bucket=self._staging_bucket, + base_output_dir=job_config.base_output_dir, ) job.submit( service_account=self._service_account, timeout=job_config.timeout_s, enable_web_access=job_config.enable_web_access, scheduling_strategy=job_config.scheduling_strategy, + tensorboard=job_config.tensorboard_resource_name, ) job.wait_for_resource_creation() logger.info(f"Created job: {job.resource_name}") diff --git a/gigl/src/common/constants/gcs.py b/gigl/src/common/constants/gcs.py index 146845428..8c375bcd9 100644 --- a/gigl/src/common/constants/gcs.py +++ b/gigl/src/common/constants/gcs.py @@ -979,7 +979,7 @@ def get_tensorboard_logs_gcs_path( """ return GcsUri.join( get_trainer_asset_dir_gcs_path(applied_task_identifier=applied_task_identifier), - "tensorboard_logs/", + "logs/", ) diff --git a/gigl/src/common/utils/tensorboard.py b/gigl/src/common/utils/tensorboard.py new file mode 100644 index 000000000..48905e673 --- /dev/null +++ b/gigl/src/common/utils/tensorboard.py @@ -0,0 
+1,94 @@ +"""Shared TensorBoard helpers for GiGL training entrypoints.""" + +import os +from typing import Any, Optional + +import tensorflow as tf + +from gigl.common import Uri + +VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY = "AIP_TENSORBOARD_LOG_DIR" + + +def resolve_tensorboard_log_dir( + configured_tensorboard_log_uri: Optional[Uri], +) -> Optional[str]: + """Resolve the TensorBoard log directory for the current runtime. + + Vertex AI sets ``AIP_TENSORBOARD_LOG_DIR`` when ``baseOutputDirectory`` is + configured on a CustomJob. Outside Vertex AI, GiGL falls back to the + TensorBoard URI stored in the task config. + + Args: + configured_tensorboard_log_uri: The TensorBoard URI from GiGL config. + + Returns: + The resolved log directory, or ``None`` when no directory is available. + """ + vertex_tensorboard_log_dir = os.environ.get(VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY) + if vertex_tensorboard_log_dir: + return vertex_tensorboard_log_dir + + if configured_tensorboard_log_uri is None: + return None + + return configured_tensorboard_log_uri.uri + + +def create_tensorboard_writer( + should_log_to_tensorboard: bool, + configured_tensorboard_log_uri: Optional[Uri], + should_write_events: bool, +) -> Optional[Any]: + """Create a TensorBoard summary writer when logging is enabled. + + Args: + should_log_to_tensorboard: Whether TensorBoard logging is enabled. + configured_tensorboard_log_uri: The TensorBoard URI from GiGL config. + should_write_events: Whether the current process should emit events. + + Returns: + A TensorBoard writer, or ``None`` when logging should be skipped. 
+ """ + if not should_log_to_tensorboard or not should_write_events: + return None + + tensorboard_log_dir = resolve_tensorboard_log_dir( + configured_tensorboard_log_uri=configured_tensorboard_log_uri + ) + if tensorboard_log_dir is None: + return None + + return tf.summary.create_file_writer(tensorboard_log_dir) + + +def write_tensorboard_scalar( + writer: Optional[Any], + tag: str, + value: float, + step: int, +) -> None: + """Write a scalar TensorBoard event when a writer is available. + + Args: + writer: TensorBoard writer created by ``create_tensorboard_writer``. + tag: The TensorBoard series name. + value: Scalar value to log. + step: TensorBoard step for the event. + """ + if writer is None: + return + + with writer.as_default(): + tf.summary.scalar(tag, value, step=step) + writer.flush() + + +def close_tensorboard_writer(writer: Optional[Any]) -> None: + """Close a TensorBoard writer when one exists. + + Args: + writer: TensorBoard writer created by ``create_tensorboard_writer``. + """ + if writer is not None: + writer.close() diff --git a/gigl/src/common/vertex_ai_launcher.py b/gigl/src/common/vertex_ai_launcher.py index 64aa86a23..86730fcde 100644 --- a/gigl/src/common/vertex_ai_launcher.py +++ b/gigl/src/common/vertex_ai_launcher.py @@ -52,6 +52,7 @@ def launch_single_pool_job( cuda_docker_uri: Optional[str], component: GiGLComponents, vertex_ai_region: str, + tensorboard_logs_uri: Optional[Uri] = None, ) -> None: """Launch a single pool job on Vertex AI. 
@@ -67,6 +68,7 @@ def launch_single_pool_job( cuda_docker_uri: Docker image URI for GPU execution component: The GiGL component (Trainer or Inferencer) vertex_ai_region: The Vertex AI region to launch the job in + tensorboard_logs_uri: Optional TensorBoard log URI for trainer jobs """ if component not in _LAUNCHABLE_COMPONENTS: raise ValueError( @@ -85,11 +87,12 @@ def launch_single_pool_job( resource_config_uri=resource_config_uri, command_str=process_command, args=process_runtime_args, - use_cuda=is_cpu_execution, + use_cuda=not is_cpu_execution, container_uri=container_uri, vertex_ai_resource_config=vertex_ai_resource_config, env_vars=[env_var.EnvVar(name="TF_CPP_MIN_LOG_LEVEL", value="3")], labels=resource_config_wrapper.get_resource_labels(component=component), + tensorboard_logs_uri=tensorboard_logs_uri, ) logger.info(f"Launching {component.value} job with config: {job_config}") @@ -115,6 +118,7 @@ def launch_graph_store_enabled_job( cpu_docker_uri: Optional[str], cuda_docker_uri: Optional[str], component: GiGLComponents, + tensorboard_logs_uri: Optional[Uri] = None, ) -> None: """Launch a graph store enabled job on Vertex AI with separate storage and compute pools. @@ -131,6 +135,7 @@ def launch_graph_store_enabled_job( cpu_docker_uri: Docker image URI for CPU execution cuda_docker_uri: Docker image URI for GPU execution component: The GiGL component (Trainer or Inferencer) + tensorboard_logs_uri: Optional TensorBoard log URI for trainer jobs """ if component not in _LAUNCHABLE_COMPONENTS: raise ValueError( @@ -139,13 +144,16 @@ def launch_graph_store_enabled_job( storage_pool_config = vertex_ai_graph_store_config.graph_store_pool compute_pool_config = vertex_ai_graph_store_config.compute_pool - # Determine if CPU or GPU based on compute pool - is_cpu_execution = _determine_if_cpu_execution( + # Compute workers may use GPUs, but storage workers always run the CPU + # graph-store entrypoint. 
+ is_compute_cpu_execution = _determine_if_cpu_execution( vertex_ai_resource_config=compute_pool_config ) cpu_docker_uri = cpu_docker_uri or DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU cuda_docker_uri = cuda_docker_uri or DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA - container_uri = cpu_docker_uri if is_cpu_execution else cuda_docker_uri + compute_container_uri = ( + cpu_docker_uri if is_compute_cpu_execution else cuda_docker_uri + ) logger.info(f"Running {component.value} with command: {compute_commmand}") @@ -153,7 +161,7 @@ def launch_graph_store_enabled_job( vertex_ai_graph_store_config.compute_cluster_local_world_size ) if not num_compute_processes: - if is_cpu_execution: + if is_compute_cpu_execution: num_compute_processes = 1 else: num_compute_processes = vertex_ai_graph_store_config.compute_pool.gpu_limit @@ -176,11 +184,12 @@ def launch_graph_store_enabled_job( resource_config_uri=resource_config_uri, command_str=compute_commmand, args=compute_runtime_args, - use_cuda=is_cpu_execution, - container_uri=container_uri, + use_cuda=not is_compute_cpu_execution, + container_uri=compute_container_uri, vertex_ai_resource_config=compute_pool_config, env_vars=environment_variables, labels=labels, + tensorboard_logs_uri=tensorboard_logs_uri, ) # Create storage pool job config @@ -190,8 +199,8 @@ def launch_graph_store_enabled_job( resource_config_uri=resource_config_uri, command_str=storage_command, args=storage_args, - use_cuda=is_cpu_execution, - container_uri=container_uri, + use_cuda=False, + container_uri=cpu_docker_uri, vertex_ai_resource_config=storage_pool_config, env_vars=environment_variables, labels=labels, @@ -227,6 +236,7 @@ def _build_job_config( vertex_ai_resource_config: VertexAiResourceConfig, env_vars: list[env_var.EnvVar], labels: Optional[dict[str, str]] = None, + tensorboard_logs_uri: Optional[Uri] = None, ) -> VertexAiJobConfig: """Build a VertexAiJobConfig for training or inference jobs. 
@@ -247,6 +257,7 @@ def _build_job_config( machine type, GPU type, replica count, timeout, and scheduling strategy. env_vars (list[env_var.EnvVar]): Environment variables to set in the container. labels (Optional[dict[str, str]]): Labels to associate with the job. Defaults to None. + tensorboard_logs_uri (Optional[Uri]): TensorBoard log URI for trainer jobs. Returns: VertexAiJobConfig: A configuration object ready to be used with VertexAIService.launch_job(). @@ -262,6 +273,13 @@ def _build_job_config( ) command = command_str.strip().split(" ") + base_output_dir = ( + _get_base_output_dir_from_tensorboard_logs_uri( + tensorboard_logs_uri=tensorboard_logs_uri + ) + if tensorboard_logs_uri is not None + else None + ) job_config = VertexAiJobConfig( job_name=job_name, @@ -291,10 +309,40 @@ def _build_job_config( reservation_affinity=_build_reservation_affinity( vertex_ai_resource_config.reservation_affinity ), + base_output_dir=base_output_dir, + tensorboard_resource_name=( + vertex_ai_resource_config.tensorboard_resource_name or None + if base_output_dir is not None + else None + ), ) return job_config +def _get_base_output_dir_from_tensorboard_logs_uri( + tensorboard_logs_uri: Uri, +) -> str: + """Return the CustomJob base output directory for a TensorBoard log URI. + + Args: + tensorboard_logs_uri: GiGL TensorBoard log URI. This is expected to + point at the ``logs/`` directory underneath the trainer asset dir. + + Returns: + The parent directory to use as ``base_output_dir``. + + Raises: + ValueError: If the URI does not contain a parent directory. + """ + normalized_tensorboard_logs_uri = tensorboard_logs_uri.uri.rstrip("/") + base_output_dir, separator, _ = normalized_tensorboard_logs_uri.rpartition("/") + if not separator or not base_output_dir: + raise ValueError( + f"TensorBoard logs URI must include a parent directory, got {tensorboard_logs_uri.uri!r}." 
+ ) + return base_output_dir + + def _build_reservation_affinity( affinity: VertexAiReservationAffinity, ) -> Optional[ReservationAffinity]: diff --git a/gigl/src/training/v1/lib/training_process.py b/gigl/src/training/v1/lib/training_process.py index 9d8e8f21b..db70147f4 100644 --- a/gigl/src/training/v1/lib/training_process.py +++ b/gigl/src/training/v1/lib/training_process.py @@ -8,7 +8,6 @@ from distutils.util import strtobool from typing import Any, Optional -import tensorflow as tf import torch import torch.distributed import torch.nn.parallel @@ -47,6 +46,7 @@ initialize_metrics, ) from gigl.src.common.utils.model import load_state_dict_from_uri +from gigl.src.common.utils.tensorboard import create_tensorboard_writer from gigl.src.common.utils.time import current_formatted_datetime from gigl.src.training.v1.lib.base_trainer import BaseTrainer @@ -216,14 +216,21 @@ def __run_training( ): trainer_instance.setup_for_training() logger.info(f"Starting training at {current_formatted_datetime()}") - tensorboard_log_uri = gbml_config_pb_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri + tensorboard_log_uri = ( + gbml_config_pb_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri + ) profiler = get_torch_profiler_instance( gbml_config_pb_wrapper=gbml_config_pb_wrapper ) - file_writer = None - if gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard: - file_writer = tf.summary.create_file_writer(tensorboard_log_uri) + configured_tensorboard_log_uri = ( + UriFactory.create_uri(tensorboard_log_uri) if tensorboard_log_uri else None + ) + file_writer = create_tensorboard_writer( + should_log_to_tensorboard=gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard, + configured_tensorboard_log_uri=configured_tensorboard_log_uri, + should_write_events=get_rank() == 0, + ) with file_writer.as_default() if file_writer else contextlib.nullcontext(): with ( diff --git a/gigl/src/training/v1/trainer.py 
b/gigl/src/training/v1/trainer.py index c1509ea54..0d4e24b08 100644 --- a/gigl/src/training/v1/trainer.py +++ b/gigl/src/training/v1/trainer.py @@ -2,7 +2,7 @@ from typing import Optional import torch -from google.cloud.aiplatform_v1.types import accelerator_type, env_var +from google.cloud.aiplatform_v1.types import accelerator_type from gigl.common import Uri, UriFactory from gigl.common.constants import ( @@ -10,11 +10,12 @@ DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA, ) from gigl.common.logger import Logger -from gigl.common.services.vertex_ai import VertexAiJobConfig, VertexAIService from gigl.env.pipelines_config import get_resource_config from gigl.src.common.constants.components import GiGLComponents from gigl.src.common.types import AppliedTaskIdentifier +from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.metrics_service_provider import initialize_metrics +from gigl.src.common.vertex_ai_launcher import launch_single_pool_job from gigl.src.training.v1.lib.training_process import GnnTrainingProcess from snapchat.research.gbml.gigl_resource_config_pb2 import ( LocalResourceConfig, @@ -43,42 +44,34 @@ def run( is_cpu_training = self._determine_if_cpu_training(trainer_config) if isinstance(trainer_config, VertexAiResourceConfig): - cpu_docker_uri = cpu_docker_uri or DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU - cuda_docker_uri = cuda_docker_uri or DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA - container_uri = cpu_docker_uri if is_cpu_training else cuda_docker_uri - environment_variables: list[env_var.EnvVar] = [ - env_var.EnvVar(name="TF_CPP_MIN_LOG_LEVEL", value="3"), - ] - job_args = [ - f"--job_name={applied_task_identifier}", - f"--task_config_uri={task_config_uri}", - f"--resource_config_uri={resource_config_uri}", - ] + ([] if is_cpu_training else ["--use_cuda"]) - - job_config = VertexAiJobConfig( - job_name=applied_task_identifier, - container_uri=container_uri, - command=["python", "-m", 
"gigl.src.training.v1.lib.training_process"], - args=job_args, - environment_variables=environment_variables, - machine_type=trainer_config.machine_type, - accelerator_type=trainer_config.gpu_type.upper().replace("-", "_"), - accelerator_count=trainer_config.gpu_limit, - replica_count=trainer_config.num_replicas, - labels=resource_config.get_resource_labels( - component=GiGLComponents.Trainer - ), - timeout_s=trainer_config.timeout if trainer_config.timeout else None, + gbml_config_pb_wrapper = ( + GbmlConfigPbWrapper.get_gbml_config_pb_wrapper_from_uri( + gbml_config_uri=task_config_uri + ) ) - - vertex_ai_service = VertexAIService( - project=resource_config.project, - location=resource_config.region, - service_account=resource_config.service_account_email, - staging_bucket=resource_config.temp_assets_regional_bucket_path.uri, + raw_tensorboard_logs_uri = ( + gbml_config_pb_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri + ) + tensorboard_logs_uri = ( + UriFactory.create_uri(raw_tensorboard_logs_uri) + if gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard + and raw_tensorboard_logs_uri + else None + ) + launch_single_pool_job( + vertex_ai_resource_config=trainer_config, + job_name=str(applied_task_identifier), + task_config_uri=task_config_uri, + resource_config_uri=resource_config_uri, + process_command="python -m gigl.src.training.v1.lib.training_process", + process_runtime_args={}, + resource_config_wrapper=resource_config, + cpu_docker_uri=cpu_docker_uri or DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU, + cuda_docker_uri=cuda_docker_uri or DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA, + component=GiGLComponents.Trainer, + vertex_ai_region=resource_config.vertex_ai_trainer_region, + tensorboard_logs_uri=tensorboard_logs_uri, ) - - vertex_ai_service.launch_job(job_config=job_config) elif isinstance(trainer_config, LocalResourceConfig): training_process = GnnTrainingProcess() diff --git a/gigl/src/training/v2/glt_trainer.py 
b/gigl/src/training/v2/glt_trainer.py index 2f8ecbbbe..eb50bc0f2 100644 --- a/gigl/src/training/v2/glt_trainer.py +++ b/gigl/src/training/v2/glt_trainer.py @@ -54,6 +54,15 @@ def __execute_VAI_training( training_process_runtime_args = ( gbml_config_pb_wrapper.trainer_config.trainer_args ) + raw_tensorboard_logs_uri = ( + gbml_config_pb_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri + ) + tensorboard_logs_uri = ( + UriFactory.create_uri(raw_tensorboard_logs_uri) + if gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard + and raw_tensorboard_logs_uri + else None + ) job_name = f"gigl_train_{applied_task_identifier}" @@ -70,6 +79,7 @@ def __execute_VAI_training( cuda_docker_uri=cuda_docker_uri, component=GiGLComponents.Trainer, vertex_ai_region=resource_config.vertex_ai_trainer_region, + tensorboard_logs_uri=tensorboard_logs_uri, ) elif isinstance(resource_config.trainer_config, VertexAiGraphStoreConfig): launch_graph_store_enabled_job( @@ -85,6 +95,7 @@ def __execute_VAI_training( cpu_docker_uri=cpu_docker_uri, cuda_docker_uri=cuda_docker_uri, component=GiGLComponents.Trainer, + tensorboard_logs_uri=tensorboard_logs_uri, ) else: raise NotImplementedError( diff --git a/gigl/src/validation_check/config_validator.py b/gigl/src/validation_check/config_validator.py index ec0ca4caf..2c6fa3d14 100644 --- a/gigl/src/validation_check/config_validator.py +++ b/gigl/src/validation_check/config_validator.py @@ -18,6 +18,7 @@ from gigl.src.validation_check.libs.gbml_and_resource_config_compatibility_checks import ( check_inferencer_graph_store_compatibility, check_trainer_graph_store_compatibility, + check_vertex_ai_trainer_tensorboard_compatibility, ) from gigl.src.validation_check.libs.name_checks import ( check_if_kfp_pipeline_job_name_valid, @@ -202,22 +203,27 @@ GiGLComponents.ConfigPopulator.value: [ check_trainer_graph_store_compatibility, check_inferencer_graph_store_compatibility, + check_vertex_ai_trainer_tensorboard_compatibility, ], 
GiGLComponents.DataPreprocessor.value: [ check_trainer_graph_store_compatibility, check_inferencer_graph_store_compatibility, + check_vertex_ai_trainer_tensorboard_compatibility, ], GiGLComponents.SubgraphSampler.value: [ check_trainer_graph_store_compatibility, check_inferencer_graph_store_compatibility, + check_vertex_ai_trainer_tensorboard_compatibility, ], GiGLComponents.SplitGenerator.value: [ check_trainer_graph_store_compatibility, check_inferencer_graph_store_compatibility, + check_vertex_ai_trainer_tensorboard_compatibility, ], GiGLComponents.Trainer.value: [ check_trainer_graph_store_compatibility, check_inferencer_graph_store_compatibility, + check_vertex_ai_trainer_tensorboard_compatibility, ], GiGLComponents.Inferencer.value: [ check_inferencer_graph_store_compatibility, diff --git a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py index fc12d1939..ee6422c3b 100644 --- a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py +++ b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py @@ -102,6 +102,46 @@ def check_trainer_graph_store_compatibility( ) +def check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper: GbmlConfigPbWrapper, + resource_config_wrapper: GiglResourceConfigWrapper, +) -> None: + """Check that Vertex AI trainer TensorBoard config is complete. + + Args: + gbml_config_pb_wrapper: The GbmlConfig wrapper. + resource_config_wrapper: The GiglResourceConfig wrapper. + + Raises: + AssertionError: If TensorBoard logging is enabled for a Vertex AI + trainer but no TensorBoard resource name is configured. + """ + logger.info( + "Config validation check: Vertex AI trainer TensorBoard compatibility between template and resource configs." 
+ ) + + if not gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard: + return + + trainer_resource_config = resource_config_wrapper.trainer_config + if isinstance(trainer_resource_config, gigl_resource_config_pb2.VertexAiResourceConfig): + tensorboard_resource_name = trainer_resource_config.tensorboard_resource_name + elif isinstance( + trainer_resource_config, gigl_resource_config_pb2.VertexAiGraphStoreConfig + ): + tensorboard_resource_name = ( + trainer_resource_config.compute_pool.tensorboard_resource_name + ) + else: + return + + assert tensorboard_resource_name, ( + "GbmlConfig.trainer_config.should_log_to_tensorboard is true, so a " + "Vertex AI TensorBoard resource name must be set in the trainer " + "resource config." + ) + + def check_inferencer_graph_store_compatibility( gbml_config_pb_wrapper: GbmlConfigPbWrapper, resource_config_wrapper: GiglResourceConfigWrapper, diff --git a/tests/unit/src/common/utils/tensorboard_test.py b/tests/unit/src/common/utils/tensorboard_test.py new file mode 100644 index 000000000..c25d69110 --- /dev/null +++ b/tests/unit/src/common/utils/tensorboard_test.py @@ -0,0 +1,93 @@ +"""Unit tests for gigl.src.common.utils.tensorboard.""" + +import os +from unittest.mock import Mock, patch + +from absl.testing import absltest + +from gigl.common import UriFactory +from gigl.src.common.utils.tensorboard import ( + VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY, + create_tensorboard_writer, + resolve_tensorboard_log_dir, + write_tensorboard_scalar, +) +from tests.test_assets.test_case import TestCase + + +class TestTensorboardUtils(TestCase): + """Tests for shared TensorBoard helpers.""" + + def test_resolve_tensorboard_log_dir_prefers_vertex_env(self) -> None: + configured_tensorboard_log_uri = UriFactory.create_uri( + "gs://perm-assets/job/trainer/logs/" + ) + + with patch.dict( + os.environ, + {VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY: "gs://vertex-managed/logs"}, + clear=False, + ): + resolved_log_dir = resolve_tensorboard_log_dir( + 
configured_tensorboard_log_uri=configured_tensorboard_log_uri + ) + + self.assertEqual(resolved_log_dir, "gs://vertex-managed/logs") + + @patch("gigl.src.common.utils.tensorboard.tf.summary.create_file_writer") + def test_create_tensorboard_writer_uses_configured_uri_when_vertex_env_missing( + self, + mock_create_file_writer, + ) -> None: + configured_tensorboard_log_uri = UriFactory.create_uri( + "gs://perm-assets/job/trainer/logs/" + ) + writer = object() + mock_create_file_writer.return_value = writer + + created_writer = create_tensorboard_writer( + should_log_to_tensorboard=True, + configured_tensorboard_log_uri=configured_tensorboard_log_uri, + should_write_events=True, + ) + + self.assertIs(created_writer, writer) + mock_create_file_writer.assert_called_once_with( + configured_tensorboard_log_uri.uri + ) + + @patch("gigl.src.common.utils.tensorboard.tf.summary.create_file_writer") + def test_create_tensorboard_writer_skips_non_chief_process( + self, + mock_create_file_writer, + ) -> None: + created_writer = create_tensorboard_writer( + should_log_to_tensorboard=True, + configured_tensorboard_log_uri=UriFactory.create_uri( + "gs://perm-assets/job/trainer/logs/" + ), + should_write_events=False, + ) + + self.assertIsNone(created_writer) + mock_create_file_writer.assert_not_called() + + @patch("gigl.src.common.utils.tensorboard.tf.summary.scalar") + def test_write_tensorboard_scalar_flushes_writer(self, mock_summary_scalar) -> None: + writer = Mock() + writer.as_default.return_value.__enter__ = Mock(return_value=None) + writer.as_default.return_value.__exit__ = Mock(return_value=None) + + write_tensorboard_scalar( + writer=writer, + tag="Loss/train", + value=1.5, + step=10, + ) + + mock_summary_scalar.assert_called_once_with("Loss/train", 1.5, step=10) + writer.flush.assert_called_once() + + +if __name__ == "__main__": + absltest.main() diff --git a/tests/unit/src/common/vertex_ai_launcher_test.py b/tests/unit/src/common/vertex_ai_launcher_test.py index 
8db251b44..f19eb0d93 100644 --- a/tests/unit/src/common/vertex_ai_launcher_test.py +++ b/tests/unit/src/common/vertex_ai_launcher_test.py @@ -59,6 +59,7 @@ def _create_gigl_resource_config_with_graph_store( gcp_region_override="us-west1", timeout=10800, scheduling_strategy="STANDARD", + tensorboard_resource_name="projects/test-project/locations/us-west1/tensorboards/test-tensorboard", ) storage_pool = gigl_resource_config_pb2.VertexAiResourceConfig( machine_type="n1-highmem-32", @@ -92,6 +93,7 @@ def _create_gigl_resource_config_with_single_pool_inference( machine_type="n1-standard-8", num_replicas=1, timeout=7200, + tensorboard_resource_name="projects/test-project/locations/us-central1/tensorboards/should-not-attach", ) # Create InferencerResourceConfig with single pool vertex AI config @@ -152,6 +154,7 @@ def test_launch_training_graph_store_cuda(self, mock_vertex_ai_service_class): cpu_docker_uri=cpu_docker_uri, cuda_docker_uri=cuda_docker_uri, component=component, + tensorboard_logs_uri=Uri("gs://test-perm-bucket/job-name/trainer/logs/"), ) # Assert - verify VertexAIService was instantiated correctly @@ -192,13 +195,28 @@ def test_launch_training_graph_store_cuda(self, mock_vertex_ai_service_class): self.assertIn( f"--epochs={process_runtime_args['epochs']}", compute_job_config.args ) + self.assertIn("--use_cuda", compute_job_config.args) + self.assertEqual( + compute_job_config.base_output_dir, + "gs://test-perm-bucket/job-name/trainer", + ) + self.assertEqual( + compute_job_config.tensorboard_resource_name, + compute_pool.tensorboard_resource_name, + ) # Verify storage pool config self.assertEqual(storage_job_config.machine_type, storage_pool.machine_type) + self.assertEqual(storage_job_config.container_uri, cpu_docker_uri) self.assertIn( "gigl.distributed.graph_store.storage_main", " ".join(storage_job_config.command), ) + self.assertIsNotNone(storage_job_config.args) + assert storage_job_config.args is not None # Type narrowing for mypy + 
self.assertNotIn("--use_cuda", storage_job_config.args) + self.assertIsNone(storage_job_config.base_output_dir) + self.assertIsNone(storage_job_config.tensorboard_resource_name) # Verify environment variables compute_env_vars = { @@ -304,6 +322,9 @@ def test_launch_inference_single_pool_cpu(self, mock_vertex_ai_service_class): self.assertIn( f"--output_path={process_runtime_args['output_path']}", job_config.args ) + self.assertNotIn("--use_cuda", job_config.args) + self.assertIsNone(job_config.base_output_dir) + self.assertIsNone(job_config.tensorboard_resource_name) # Verify resource labels expected_labels = { diff --git a/tests/unit/src/common/vertex_ai_test.py b/tests/unit/src/common/vertex_ai_test.py new file mode 100644 index 000000000..8047b35f3 --- /dev/null +++ b/tests/unit/src/common/vertex_ai_test.py @@ -0,0 +1,65 @@ +"""Unit tests for gigl.common.services.vertex_ai.""" + +from unittest.mock import Mock, patch + +from absl.testing import absltest + +from gigl.common.services.vertex_ai import VertexAIService, VertexAiJobConfig +from tests.test_assets.test_case import TestCase + + +class TestVertexAIService(TestCase): + """Tests for Vertex AI CustomJob submission plumbing.""" + + @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") + @patch("gigl.common.services.vertex_ai.aiplatform.init") + def test_submit_job_passes_tensorboard_and_base_output_dir( + self, + mock_aiplatform_init, + mock_custom_job_class, + ) -> None: + mock_job = Mock() + mock_job.resource_name = "projects/test/locations/us-central1/customJobs/123" + mock_job.name = "123" + mock_custom_job_class.return_value = mock_job + + service = VertexAIService( + project="test-project", + location="us-central1", + service_account="svc@test-project.iam.gserviceaccount.com", + staging_bucket="gs://test-staging-bucket", + ) + + job_config = VertexAiJobConfig( + job_name="test-job", + container_uri="gcr.io/test/image:latest", + command=["python", "-m", "trainer"], + 
base_output_dir="gs://test-perm-bucket/test-job/trainer", + tensorboard_resource_name=( + "projects/test-project/locations/us-central1/tensorboards/123" + ), + ) + + service.launch_job(job_config=job_config) + + mock_aiplatform_init.assert_called_once_with( + project="test-project", + location="us-central1", + staging_bucket="gs://test-staging-bucket", + ) + mock_custom_job_class.assert_called_once() + _, custom_job_kwargs = mock_custom_job_class.call_args + self.assertEqual( + custom_job_kwargs["base_output_dir"], + job_config.base_output_dir, + ) + mock_job.submit.assert_called_once() + _, submit_kwargs = mock_job.submit.call_args + self.assertEqual( + submit_kwargs["tensorboard"], + job_config.tensorboard_resource_name, + ) + + +if __name__ == "__main__": + absltest.main() diff --git a/tests/unit/src/config_populator/config_populator_functionality_test.py b/tests/unit/src/config_populator/config_populator_functionality_test.py index 440b4cc95..4ce71d1af 100644 --- a/tests/unit/src/config_populator/config_populator_functionality_test.py +++ b/tests/unit/src/config_populator/config_populator_functionality_test.py @@ -101,6 +101,7 @@ def test_sgs_config_population_is_accurate( ) self.assertNotEqual(trained_model_metadata_pb.trained_model_uri, "") self.assertNotEqual(trained_model_metadata_pb.scripted_model_uri, "") + self.assertTrue(trained_model_metadata_pb.tensorboard_logs_uri.endswith("/logs/")) # Assert inference metadata assets were set inference_metadata_pb: inference_metadata_pb2.InferenceMetadata = ( @@ -189,6 +190,7 @@ def test_glt_config_population_is_accurate( ) self.assertNotEqual(trained_model_metadata_pb.trained_model_uri, "") self.assertNotEqual(trained_model_metadata_pb.scripted_model_uri, "") + self.assertTrue(trained_model_metadata_pb.tensorboard_logs_uri.endswith("/logs/")) # Assert inference metadata assets were set inference_metadata_pb: inference_metadata_pb2.InferenceMetadata = ( diff --git 
a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py index c70450501..734074987 100644 --- a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py +++ b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py @@ -7,6 +7,7 @@ from gigl.src.validation_check.libs.gbml_and_resource_config_compatibility_checks import ( check_inferencer_graph_store_compatibility, check_trainer_graph_store_compatibility, + check_vertex_ai_trainer_tensorboard_compatibility, ) from snapchat.research.gbml import gbml_config_pb2, gigl_resource_config_pb2 from tests.test_assets.test_case import TestCase @@ -94,6 +95,13 @@ def _create_gbml_config_without_graph_stores() -> GbmlConfigPbWrapper: return GbmlConfigPbWrapper(gbml_config_pb=gbml_config) +def _create_gbml_config_with_tensorboard_enabled() -> GbmlConfigPbWrapper: + """Create a GbmlConfig with trainer TensorBoard logging enabled.""" + gbml_config = gbml_config_pb2.GbmlConfig() + gbml_config.trainer_config.should_log_to_tensorboard = True + return GbmlConfigPbWrapper(gbml_config_pb=gbml_config) + + def _create_resource_config_with_both_graph_stores() -> GiglResourceConfigWrapper: """Create a GiglResourceConfig with VertexAiGraphStoreConfig for both trainer and inferencer.""" config = gigl_resource_config_pb2.GiglResourceConfig() @@ -126,6 +134,33 @@ def _create_resource_config_without_graph_stores() -> GiglResourceConfigWrapper: return GiglResourceConfigWrapper(resource_config=config) +def _create_resource_config_with_trainer_tensorboard( + *, + tensorboard_resource_name: str, + use_graph_store: bool = False, +) -> GiglResourceConfigWrapper: + """Create a GiglResourceConfig with a trainer TensorBoard resource.""" + config = gigl_resource_config_pb2.GiglResourceConfig() + _create_shared_resource_config(config) + + if use_graph_store: + graph_store_config = 
_create_vertex_ai_graph_store_config() + graph_store_config.compute_pool.tensorboard_resource_name = ( + tensorboard_resource_name + ) + config.trainer_resource_config.vertex_ai_graph_store_trainer_config.CopyFrom( + graph_store_config + ) + else: + vertex_ai_resource_config = _create_vertex_ai_resource_config() + vertex_ai_resource_config.tensorboard_resource_name = tensorboard_resource_name + config.trainer_resource_config.vertex_ai_trainer_config.CopyFrom( + vertex_ai_resource_config + ) + + return GiglResourceConfigWrapper(resource_config=config) + + class TestTrainerGraphStoreCompatibility(TestCase): """Test suite for trainer graph store compatibility checks.""" @@ -203,6 +238,47 @@ def test_template_has_inferencer_graph_store_resource_does_not(self): resource_config_wrapper=resource_config, ) + +class TestVertexAITrainerTensorboardCompatibility(TestCase): + """Test suite for Vertex AI trainer TensorBoard compatibility checks.""" + + def test_vertex_ai_trainer_tensorboard_config_present(self): + gbml_config = _create_gbml_config_with_tensorboard_enabled() + resource_config = _create_resource_config_with_trainer_tensorboard( + tensorboard_resource_name=( + "projects/test-project/locations/us-central1/tensorboards/test" + ) + ) + + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + + def test_graph_store_trainer_tensorboard_config_present(self): + gbml_config = _create_gbml_config_with_tensorboard_enabled() + resource_config = _create_resource_config_with_trainer_tensorboard( + tensorboard_resource_name=( + "projects/test-project/locations/us-central1/tensorboards/test" + ), + use_graph_store=True, + ) + + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + + def test_vertex_ai_trainer_tensorboard_missing_resource_name_raises(self): + gbml_config = 
_create_gbml_config_with_tensorboard_enabled() + resource_config = _create_resource_config_without_graph_stores() + + with self.assertRaises(AssertionError): + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + def test_resource_has_inferencer_graph_store_template_does_not(self): """Test that resource having graph store but template not raises an assertion error.""" gbml_config = _create_gbml_config_without_graph_stores() From cde7d00bebd4b1816580e60e33219e8d4e62989a Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 29 Apr 2026 15:47:54 +0000 Subject: [PATCH 03/59] test(tensorboard): add failing tests for TensorBoardWriter class Replace tests for the four-function API with tests for the new TensorBoardWriter class. Tests fail with ImportError until the class lands in the next commit. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../unit/src/common/utils/tensorboard_test.py | 162 ++++++++++-------- 1 file changed, 95 insertions(+), 67 deletions(-) diff --git a/tests/unit/src/common/utils/tensorboard_test.py b/tests/unit/src/common/utils/tensorboard_test.py index c25d69110..e494dcd0e 100644 --- a/tests/unit/src/common/utils/tensorboard_test.py +++ b/tests/unit/src/common/utils/tensorboard_test.py @@ -6,87 +6,115 @@ from absl.testing import absltest from gigl.common import UriFactory -from gigl.src.common.utils.tensorboard import ( - VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY, - create_tensorboard_writer, - resolve_tensorboard_log_dir, - write_tensorboard_scalar, -) +from gigl.src.common.utils.tensorboard import TensorBoardWriter from tests.test_assets.test_case import TestCase -class TestTensorboardUtils(TestCase): - """Tests for shared TensorBoard helpers.""" +class TestTensorBoardWriter(TestCase): + """Tests for the TensorBoardWriter class.""" - def test_resolve_tensorboard_log_dir_prefers_vertex_env(self) -> None: - configured_tensorboard_log_uri = UriFactory.create_uri( - 
"gs://perm-assets/job/trainer/logs/" - ) + def test_from_uri_returns_noop_when_disabled(self) -> None: + configured_uri = UriFactory.create_uri("gs://config/logs/") + with patch( + "gigl.src.common.utils.tensorboard.tf.summary.create_file_writer" + ) as mock_create_file_writer: + writer = TensorBoardWriter.from_uri(configured_uri, enabled=False) + writer.log({"Loss/train": 1.0}, step=0) + writer.close() + mock_create_file_writer.assert_not_called() + + def test_from_uri_prefers_vertex_env_var(self) -> None: + configured_uri = UriFactory.create_uri("gs://config/logs/") with patch.dict( os.environ, - {VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY: "gs://vertex-managed/logs"}, + {"AIP_TENSORBOARD_LOG_DIR": "gs://vertex-managed/logs"}, clear=False, ): - resolved_log_dir = resolve_tensorboard_log_dir( - configured_tensorboard_log_uri=configured_tensorboard_log_uri - ) + with patch( + "gigl.src.common.utils.tensorboard.tf.summary.create_file_writer" + ) as mock_create_file_writer: + TensorBoardWriter.from_uri(configured_uri) + + mock_create_file_writer.assert_called_once_with("gs://vertex-managed/logs") + + def test_from_uri_falls_back_to_configured_uri(self) -> None: + configured_uri = UriFactory.create_uri("gs://config/logs/") + with patch.dict(os.environ, {}, clear=True): + with patch( + "gigl.src.common.utils.tensorboard.tf.summary.create_file_writer" + ) as mock_create_file_writer: + TensorBoardWriter.from_uri(configured_uri) + + mock_create_file_writer.assert_called_once_with(configured_uri.uri) + + def test_from_uri_returns_noop_when_no_uri_anywhere(self) -> None: + with patch.dict(os.environ, {}, clear=True): + with patch( + "gigl.src.common.utils.tensorboard.tf.summary.create_file_writer" + ) as mock_create_file_writer: + writer = TensorBoardWriter.from_uri(configured_uri=None) + writer.log({"Loss/train": 1.0}, step=0) + writer.close() - self.assertEqual(resolved_log_dir, "gs://vertex-managed/logs") + mock_create_file_writer.assert_not_called() - 
@patch("gigl.src.common.utils.tensorboard.tf.summary.create_file_writer") - def test_create_tensorboard_writer_uses_configured_uri_when_vertex_env_missing( - self, - mock_create_file_writer, - ) -> None: - configured_tensorboard_log_uri = UriFactory.create_uri( - "gs://perm-assets/job/trainer/logs/" - ) - writer = object() - mock_create_file_writer.return_value = writer - - created_writer = create_tensorboard_writer( - should_log_to_tensorboard=True, - configured_tensorboard_log_uri=configured_tensorboard_log_uri, - should_write_events=True, - ) - - self.assertIs(created_writer, writer) - mock_create_file_writer.assert_called_once_with( - configured_tensorboard_log_uri.uri - ) - - @patch("gigl.src.common.utils.tensorboard.tf.summary.create_file_writer") - def test_create_tensorboard_writer_skips_non_chief_process( - self, - mock_create_file_writer, + @patch("gigl.src.common.utils.tensorboard.tf.summary.scalar") + def test_log_writes_each_metric_at_step_and_flushes( + self, mock_summary_scalar ) -> None: - created_writer = create_tensorboard_writer( - should_log_to_tensorboard=True, - configured_tensorboard_log_uri=UriFactory.create_uri( - "gs://perm-assets/job/trainer/logs/" - ), - should_write_events=False, - ) - - self.assertIsNone(created_writer) - mock_create_file_writer.assert_not_called() + underlying_writer = Mock() + underlying_writer.as_default.return_value.__enter__ = Mock(return_value=None) + underlying_writer.as_default.return_value.__exit__ = Mock(return_value=None) + with patch( + "gigl.src.common.utils.tensorboard.tf.summary.create_file_writer", + return_value=underlying_writer, + ): + writer = TensorBoardWriter(log_dir="gs://logs/") + writer.log({"Loss/train": 1.5, "Loss/val": 2.0}, step=10) + + self.assertEqual(mock_summary_scalar.call_count, 2) + mock_summary_scalar.assert_any_call("Loss/train", 1.5, step=10) + mock_summary_scalar.assert_any_call("Loss/val", 2.0, step=10) + underlying_writer.flush.assert_called_once() 
@patch("gigl.src.common.utils.tensorboard.tf.summary.scalar") - def test_write_tensorboard_scalar_flushes_writer(self, mock_summary_scalar) -> None: - writer = Mock() - writer.as_default.return_value.__enter__ = Mock(return_value=None) - writer.as_default.return_value.__exit__ = Mock(return_value=None) - - write_tensorboard_scalar( - writer=writer, - tag="Loss/train", - value=1.5, - step=10, - ) - - mock_summary_scalar.assert_called_once_with("Loss/train", 1.5, step=10) - writer.flush.assert_called_once() + def test_log_is_noop_when_writer_disabled(self, mock_summary_scalar) -> None: + with patch( + "gigl.src.common.utils.tensorboard.tf.summary.create_file_writer" + ) as mock_create_file_writer: + writer = TensorBoardWriter(log_dir=None) + writer.log({"Loss/train": 1.0}, step=0) + + mock_create_file_writer.assert_not_called() + mock_summary_scalar.assert_not_called() + + def test_context_manager_closes_writer(self) -> None: + underlying_writer = Mock() + with patch( + "gigl.src.common.utils.tensorboard.tf.summary.create_file_writer", + return_value=underlying_writer, + ): + with TensorBoardWriter(log_dir="gs://logs/"): + pass + + underlying_writer.close.assert_called_once() + + def test_close_is_idempotent(self) -> None: + underlying_writer = Mock() + with patch( + "gigl.src.common.utils.tensorboard.tf.summary.create_file_writer", + return_value=underlying_writer, + ): + writer = TensorBoardWriter(log_dir="gs://logs/") + writer.close() + writer.close() + + underlying_writer.close.assert_called_once() + + def test_close_on_noop_writer_does_not_raise(self) -> None: + writer = TensorBoardWriter(log_dir=None) + writer.close() # No exception expected. 
if __name__ == "__main__": From ddc4fcd57594ec92c5749793863fd26213930d2a Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 29 Apr 2026 15:52:35 +0000 Subject: [PATCH 04/59] test(tensorboard): cover close idempotency on no-op writer Code review found that test_close_on_noop_writer_does_not_raise only called close() once, so idempotency on the no-op path was untested. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/unit/src/common/utils/tensorboard_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unit/src/common/utils/tensorboard_test.py b/tests/unit/src/common/utils/tensorboard_test.py index e494dcd0e..19378aa97 100644 --- a/tests/unit/src/common/utils/tensorboard_test.py +++ b/tests/unit/src/common/utils/tensorboard_test.py @@ -114,7 +114,8 @@ def test_close_is_idempotent(self) -> None: def test_close_on_noop_writer_does_not_raise(self) -> None: writer = TensorBoardWriter(log_dir=None) - writer.close() # No exception expected. + writer.close() + writer.close() # Idempotent on no-op writer. if __name__ == "__main__": From a4e1bca5e4b5c8e6029a3e884dd8c014b77bee24 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 29 Apr 2026 16:27:12 +0000 Subject: [PATCH 05/59] refactor(tensorboard): replace function API with TensorBoardWriter class Collapse resolve_tensorboard_log_dir, create_tensorboard_writer, write_tensorboard_scalar, close_tensorboard_writer, and VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY into a single TensorBoardWriter class with from_uri classmethod. The class is context-managerable and no-ops when disabled, eliminating Optional plumbing at call sites. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- gigl/src/common/utils/tensorboard.py | 160 +++++++++++++++------------ 1 file changed, 91 insertions(+), 69 deletions(-) diff --git a/gigl/src/common/utils/tensorboard.py b/gigl/src/common/utils/tensorboard.py index 48905e673..d7fd54c2f 100644 --- a/gigl/src/common/utils/tensorboard.py +++ b/gigl/src/common/utils/tensorboard.py @@ -1,4 +1,4 @@ -"""Shared TensorBoard helpers for GiGL training entrypoints.""" +"""TensorBoard writer for GiGL training entrypoints.""" import os from typing import Any, Optional @@ -7,88 +7,110 @@ from gigl.common import Uri -VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY = "AIP_TENSORBOARD_LOG_DIR" +_VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY = "AIP_TENSORBOARD_LOG_DIR" -def resolve_tensorboard_log_dir( - configured_tensorboard_log_uri: Optional[Uri], -) -> Optional[str]: - """Resolve the TensorBoard log directory for the current runtime. +def _resolve_log_dir(configured_uri: Optional[Uri]) -> Optional[str]: + """Resolve the TensorBoard log directory. - Vertex AI sets ``AIP_TENSORBOARD_LOG_DIR`` when ``baseOutputDirectory`` is - configured on a CustomJob. Outside Vertex AI, GiGL falls back to the - TensorBoard URI stored in the task config. + Vertex AI populates ``AIP_TENSORBOARD_LOG_DIR`` when ``baseOutputDirectory`` + is configured on a CustomJob. Outside Vertex AI, GiGL falls back to the + URI from the task config. Args: - configured_tensorboard_log_uri: The TensorBoard URI from GiGL config. + configured_uri: The TensorBoard URI from GiGL config. Returns: The resolved log directory, or ``None`` when no directory is available. 
""" - vertex_tensorboard_log_dir = os.environ.get(VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY) - if vertex_tensorboard_log_dir: - return vertex_tensorboard_log_dir - - if configured_tensorboard_log_uri is None: + vertex_log_dir = os.environ.get(_VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY) + if vertex_log_dir: + return vertex_log_dir + if configured_uri is None: return None + return configured_uri.uri - return configured_tensorboard_log_uri.uri +class TensorBoardWriter: + """Writes scalar metrics to TensorBoard. -def create_tensorboard_writer( - should_log_to_tensorboard: bool, - configured_tensorboard_log_uri: Optional[Uri], - should_write_events: bool, -) -> Optional[Any]: - """Create a TensorBoard summary writer when logging is enabled. + No-ops when disabled or when no log directory is available, so callers + never see ``Optional[TensorBoardWriter]`` plumbing. - Args: - should_log_to_tensorboard: Whether TensorBoard logging is enabled. - configured_tensorboard_log_uri: The TensorBoard URI from GiGL config. - should_write_events: Whether the current process should emit events. + The writer flushes after every ``log()`` call so that Vertex's TensorBoard + UI sees events live as training progresses. - Returns: - A TensorBoard writer, or ``None`` when logging should be skipped. + Example: + >>> with TensorBoardWriter.from_uri(uri, enabled=is_chief and should_log) as tb: + ... tb.log({"Loss/train": loss, "Loss/val": vloss}, step=batch_idx) """ - if not should_log_to_tensorboard or not should_write_events: - return None - - tensorboard_log_dir = resolve_tensorboard_log_dir( - configured_tensorboard_log_uri=configured_tensorboard_log_uri - ) - if tensorboard_log_dir is None: - return None - - return tf.summary.create_file_writer(tensorboard_log_dir) - - -def write_tensorboard_scalar( - writer: Optional[Any], - tag: str, - value: float, - step: int, -) -> None: - """Write a scalar TensorBoard event when a writer is available. 
- Args: - writer: TensorBoard writer created by ``create_tensorboard_writer``. - tag: The TensorBoard series name. - value: Scalar value to log. - step: TensorBoard step for the event. - """ - if writer is None: - return - - with writer.as_default(): - tf.summary.scalar(tag, value, step=step) - writer.flush() - - -def close_tensorboard_writer(writer: Optional[Any]) -> None: - """Close a TensorBoard writer when one exists. - - Args: - writer: TensorBoard writer created by ``create_tensorboard_writer``. - """ - if writer is not None: - writer.close() + def __init__(self, log_dir: Optional[str]) -> None: + """Initialize the writer. + + Args: + log_dir: Destination directory for TensorBoard events. When + ``None``, the writer is a no-op and allocates no TF resources. + """ + self._writer: Optional[Any] = ( + tf.summary.create_file_writer(log_dir) if log_dir else None + ) + self._closed = False + + @classmethod + def from_uri( + cls, + configured_uri: Optional[Uri], + *, + enabled: bool = True, + ) -> "TensorBoardWriter": + """Build a writer with Vertex AI env-var precedence. + + When ``enabled`` is ``False``, returns a no-op writer without reading + the environment or the configured URI. + + Args: + configured_uri: The TensorBoard URI from GiGL config. Used only + when ``AIP_TENSORBOARD_LOG_DIR`` is unset. + enabled: Whether this caller is responsible for writing events. + Typically ``should_log_to_tensorboard and is_chief_process``. + + Returns: + A ``TensorBoardWriter`` instance — real if enabled and a log + directory was resolved, no-op otherwise. + """ + if not enabled: + return cls(log_dir=None) + return cls(log_dir=_resolve_log_dir(configured_uri)) + + def log(self, metrics: dict[str, float], step: int) -> None: + """Write each metric scalar at ``step`` and flush. + + No-ops when the writer is disabled or already closed. + + Args: + metrics: Mapping of TensorBoard tag to scalar value. All entries + are written at the same step. 
+ step: TensorBoard step for the events. + """ + if self._writer is None or self._closed: + return + with self._writer.as_default(): + for tag, value in metrics.items(): + tf.summary.scalar(tag, value, step=step) + self._writer.flush() + + def close(self) -> None: + """Close the underlying TF writer. + + Idempotent; safe to call multiple times and on no-op writers. + """ + if self._writer is not None and not self._closed: + self._writer.close() + self._closed = True + + def __enter__(self) -> "TensorBoardWriter": + return self + + def __exit__(self, *_exc: object) -> None: + self.close() From e8037c5bc8971418d8cb4a8572773aa3f3023373 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 29 Apr 2026 16:44:04 +0000 Subject: [PATCH 06/59] refactor(v1-trainer): remove dead tensorboard writer plumbing The V1 BaseTrainer.train body never wrote scalars via TF's ambient default writer, so the file_writer + as_default() block in __run_training served no purpose. Verified with grep: no tf.summary.scalar or write_tensorboard_scalar callers anywhere in gigl/src/training/v1/. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- gigl/src/training/v1/lib/training_process.py | 34 ++++++-------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/gigl/src/training/v1/lib/training_process.py b/gigl/src/training/v1/lib/training_process.py index db70147f4..31894a7c3 100644 --- a/gigl/src/training/v1/lib/training_process.py +++ b/gigl/src/training/v1/lib/training_process.py @@ -46,7 +46,6 @@ initialize_metrics, ) from gigl.src.common.utils.model import load_state_dict_from_uri -from gigl.src.common.utils.tensorboard import create_tensorboard_writer from gigl.src.common.utils.time import current_formatted_datetime from gigl.src.training.v1.lib.base_trainer import BaseTrainer @@ -216,33 +215,20 @@ def __run_training( ): trainer_instance.setup_for_training() logger.info(f"Starting training at {current_formatted_datetime()}") - tensorboard_log_uri = ( - gbml_config_pb_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri - ) profiler = get_torch_profiler_instance( gbml_config_pb_wrapper=gbml_config_pb_wrapper ) - configured_tensorboard_log_uri = ( - UriFactory.create_uri(tensorboard_log_uri) if tensorboard_log_uri else None - ) - file_writer = create_tensorboard_writer( - should_log_to_tensorboard=gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard, - configured_tensorboard_log_uri=configured_tensorboard_log_uri, - should_write_events=get_rank() == 0, - ) - - with file_writer.as_default() if file_writer else contextlib.nullcontext(): - with ( - profiler.profiler_context() # type: ignore[attr-defined] - if profiler - else contextlib.nullcontext() as prof - ): - trainer_instance.train( - gbml_config_pb_wrapper=gbml_config_pb_wrapper, - device=device, - profiler=prof, - ) + with ( + profiler.profiler_context() # type: ignore[attr-defined] + if profiler + else contextlib.nullcontext() as prof + ): + trainer_instance.train( + gbml_config_pb_wrapper=gbml_config_pb_wrapper, + device=device, + profiler=prof, + ) if 
profiler: if does_path_exist(TMP_PROFILER_LOG_DIR_NAME): file_loader = FileLoader() From d6e1f9f77f09db226a3c531a0c97c5489ea6fc7d Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 29 Apr 2026 16:51:30 +0000 Subject: [PATCH 07/59] refactor(examples): migrate homogeneous_training.py to TensorBoardWriter Replace function-based tensorboard helpers with the new TensorBoardWriter class. Collapses two back-to-back scalar writes per log step into a single dict-style log() call. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../link_prediction/homogeneous_training.py | 48 ++++++------------- 1 file changed, 14 insertions(+), 34 deletions(-) diff --git a/examples/link_prediction/homogeneous_training.py b/examples/link_prediction/homogeneous_training.py index adfe8dff3..1c50af1f3 100644 --- a/examples/link_prediction/homogeneous_training.py +++ b/examples/link_prediction/homogeneous_training.py @@ -58,11 +58,7 @@ ) from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict -from gigl.src.common.utils.tensorboard import ( - close_tensorboard_writer, - create_tensorboard_writer, - write_tensorboard_scalar, -) +from gigl.src.common.utils.tensorboard import TensorBoardWriter from gigl.types.graph import to_homogeneous from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout @@ -369,10 +365,9 @@ def _training_process( logger.info(f"---Rank {rank} training process group initialized") is_chief_process = args.machine_rank == 0 and local_rank == 0 - tensorboard_writer = create_tensorboard_writer( - should_log_to_tensorboard=args.should_log_to_tensorboard, - configured_tensorboard_log_uri=args.tensorboard_log_uri, - should_write_events=is_chief_process, + tensorboard_writer = TensorBoardWriter.from_uri( + args.tensorboard_log_uri, + enabled=args.should_log_to_tensorboard and is_chief_process, ) loss_fn = RetrievalLoss( @@ -494,10 +489,11 @@ 
def _training_process( logger.info( f"rank={rank}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" ) - write_tensorboard_scalar( - writer=tensorboard_writer, - tag="Time/batch_mean_sec", - value=mean_batch_time, + tensorboard_writer.log( + { + "Time/batch_mean_sec": mean_batch_time, + "Loss/train": mean_train_loss, + }, step=batch_idx, ) last_n_batch_time.clear() @@ -505,12 +501,6 @@ def _training_process( logger.info( f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" ) - write_tensorboard_scalar( - writer=tensorboard_writer, - tag="Loss/train", - value=mean_train_loss, - step=batch_idx, - ) last_n_batch_avg_loss.clear() if batch_idx % args.val_every_n_batch == 0: @@ -525,11 +515,8 @@ def _training_process( log_every_n_batch=args.log_every_n_batch, num_batches=num_val_batches_per_process, ) - write_tensorboard_scalar( - writer=tensorboard_writer, - tag="Loss/val", - value=global_avg_val_loss, - step=batch_idx, + tensorboard_writer.log( + {"Loss/val": global_avg_val_loss}, step=batch_idx ) model.train() @@ -608,12 +595,7 @@ def _training_process( device=device, log_every_n_batch=args.log_every_n_batch, ) - write_tensorboard_scalar( - writer=tensorboard_writer, - tag="Loss/test", - value=global_avg_test_loss, - step=batch_idx, - ) + tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -643,7 +625,7 @@ def _training_process( logger.info( f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" ) - close_tensorboard_writer(tensorboard_writer) + tensorboard_writer.close() torch.distributed.destroy_process_group() @@ -859,9 +841,7 @@ def _run_example_training( eval_metrics_uri: Optional[Uri] = ( UriFactory.create_uri(raw_eval_metrics_uri) if 
raw_eval_metrics_uri else None ) - raw_tensorboard_log_uri = ( - gbml_config_pb_wrapper.gbml_config_pb.shared_config.trained_model_metadata.tensorboard_logs_uri - ) + raw_tensorboard_log_uri = gbml_config_pb_wrapper.gbml_config_pb.shared_config.trained_model_metadata.tensorboard_logs_uri tensorboard_log_uri: Optional[Uri] = ( UriFactory.create_uri(raw_tensorboard_log_uri) if raw_tensorboard_log_uri From 2018f234cc12afe5f99173f2845efb0e7029460e Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 29 Apr 2026 16:55:22 +0000 Subject: [PATCH 08/59] refactor(examples): migrate heterogeneous_training.py to TensorBoardWriter Co-Authored-By: Claude Opus 4.7 (1M context) --- .../link_prediction/heterogeneous_training.py | 48 ++++++------------- 1 file changed, 14 insertions(+), 34 deletions(-) diff --git a/examples/link_prediction/heterogeneous_training.py b/examples/link_prediction/heterogeneous_training.py index bc43770c5..cb136eeb1 100644 --- a/examples/link_prediction/heterogeneous_training.py +++ b/examples/link_prediction/heterogeneous_training.py @@ -63,11 +63,7 @@ ) from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict -from gigl.src.common.utils.tensorboard import ( - close_tensorboard_writer, - create_tensorboard_writer, - write_tensorboard_scalar, -) +from gigl.src.common.utils.tensorboard import TensorBoardWriter from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout @@ -410,10 +406,9 @@ def _training_process( torch.cuda.set_device(device) logger.info(f"---Rank {rank} training process set device {device}") is_chief_process = args.machine_rank == 0 and local_rank == 0 - tensorboard_writer = create_tensorboard_writer( - should_log_to_tensorboard=args.should_log_to_tensorboard, - configured_tensorboard_log_uri=args.tensorboard_log_uri, - should_write_events=is_chief_process, + tensorboard_writer = 
TensorBoardWriter.from_uri( + args.tensorboard_log_uri, + enabled=args.should_log_to_tensorboard and is_chief_process, ) loss_fn = RetrievalLoss( loss=torch.nn.CrossEntropyLoss(reduction="mean"), @@ -535,10 +530,11 @@ def _training_process( logger.info( f"rank={rank}, batch={batch_idx}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" ) - write_tensorboard_scalar( - writer=tensorboard_writer, - tag="Time/batch_mean_sec", - value=mean_batch_time, + tensorboard_writer.log( + { + "Time/batch_mean_sec": mean_batch_time, + "Loss/train": mean_train_loss, + }, step=batch_idx, ) last_n_batch_time.clear() @@ -546,12 +542,6 @@ def _training_process( logger.info( f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" ) - write_tensorboard_scalar( - writer=tensorboard_writer, - tag="Loss/train", - value=mean_train_loss, - step=batch_idx, - ) last_n_batch_avg_loss.clear() if batch_idx % args.val_every_n_batch == 0: @@ -567,11 +557,8 @@ def _training_process( log_every_n_batch=args.log_every_n_batch, num_batches=num_val_batches_per_process, ) - write_tensorboard_scalar( - writer=tensorboard_writer, - tag="Loss/val", - value=global_avg_val_loss, - step=batch_idx, + tensorboard_writer.log( + {"Loss/val": global_avg_val_loss}, step=batch_idx ) model.train() @@ -654,12 +641,7 @@ def _training_process( device=device, log_every_n_batch=args.log_every_n_batch, ) - write_tensorboard_scalar( - writer=tensorboard_writer, - tag="Loss/test", - value=global_avg_test_loss, - step=batch_idx, - ) + tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -689,7 +671,7 @@ def _training_process( logger.info( f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" ) - 
close_tensorboard_writer(tensorboard_writer) + tensorboard_writer.close() torch.distributed.destroy_process_group() @@ -923,9 +905,7 @@ def _run_example_training( eval_metrics_uri: Optional[Uri] = ( UriFactory.create_uri(raw_eval_metrics_uri) if raw_eval_metrics_uri else None ) - raw_tensorboard_log_uri = ( - gbml_config_pb_wrapper.gbml_config_pb.shared_config.trained_model_metadata.tensorboard_logs_uri - ) + raw_tensorboard_log_uri = gbml_config_pb_wrapper.gbml_config_pb.shared_config.trained_model_metadata.tensorboard_logs_uri tensorboard_log_uri: Optional[Uri] = ( UriFactory.create_uri(raw_tensorboard_log_uri) if raw_tensorboard_log_uri From 89939de25c4b71b1e231b6009b0770a7784eb8de Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 29 Apr 2026 17:01:42 +0000 Subject: [PATCH 09/59] refactor(examples): migrate graph_store/homogeneous_training.py to TensorBoardWriter Co-Authored-By: Claude Opus 4.7 (1M context) --- .../graph_store/homogeneous_training.py | 49 ++++++------------- 1 file changed, 15 insertions(+), 34 deletions(-) diff --git a/examples/link_prediction/graph_store/homogeneous_training.py b/examples/link_prediction/graph_store/homogeneous_training.py index ec1ad7b42..ea36f9e0d 100644 --- a/examples/link_prediction/graph_store/homogeneous_training.py +++ b/examples/link_prediction/graph_store/homogeneous_training.py @@ -157,11 +157,7 @@ ) from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict -from gigl.src.common.utils.tensorboard import ( - close_tensorboard_writer, - create_tensorboard_writer, - write_tensorboard_scalar, -) +from gigl.src.common.utils.tensorboard import TensorBoardWriter from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout @@ -459,10 +455,9 @@ def _training_process( if torch.cuda.is_available(): torch.cuda.set_device(device) logger.info(f"---Rank {rank} training process set device 
{device}") - tensorboard_writer = create_tensorboard_writer( - should_log_to_tensorboard=args.should_log_to_tensorboard, - configured_tensorboard_log_uri=args.tensorboard_log_uri, - should_write_events=rank == 0, + tensorboard_writer = TensorBoardWriter.from_uri( + args.tensorboard_log_uri, + enabled=args.should_log_to_tensorboard and rank == 0, ) loss_fn = RetrievalLoss( @@ -579,22 +574,18 @@ def _training_process( logger.info( f"rank={rank}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" ) - write_tensorboard_scalar( - writer=tensorboard_writer, - tag="Time/batch_mean_sec", - value=mean_batch_time, + tensorboard_writer.log( + { + "Time/batch_mean_sec": mean_batch_time, + "Loss/train": mean_train_loss, + }, step=batch_idx, ) last_n_batch_time.clear() + # log the global average training loss logger.info( f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" ) - write_tensorboard_scalar( - writer=tensorboard_writer, - tag="Loss/train", - value=mean_train_loss, - step=batch_idx, - ) last_n_batch_avg_loss.clear() flush() @@ -610,11 +601,8 @@ def _training_process( log_every_n_batch=args.log_every_n_batch, num_batches=num_val_batches_per_process, ) - write_tensorboard_scalar( - writer=tensorboard_writer, - tag="Loss/val", - value=global_avg_val_loss, - step=batch_idx, + tensorboard_writer.log( + {"Loss/val": global_avg_val_loss}, step=batch_idx ) model.train() @@ -691,12 +679,7 @@ def _training_process( device=device, log_every_n_batch=args.log_every_n_batch, ) - write_tensorboard_scalar( - writer=tensorboard_writer, - tag="Loss/test", - value=global_avg_test_loss, - step=batch_idx, - ) + tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -724,7 +707,7 @@ def _training_process( 
f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" ) flush() - close_tensorboard_writer(tensorboard_writer) + tensorboard_writer.close() # Graph store mode cleanup: shutdown the compute process connection to the storage cluster. shutdown_compute_proccess() @@ -954,9 +937,7 @@ def _run_example_training( eval_metrics_uri: Optional[Uri] = ( UriFactory.create_uri(raw_eval_metrics_uri) if raw_eval_metrics_uri else None ) - raw_tensorboard_log_uri = ( - gbml_config_pb_wrapper.gbml_config_pb.shared_config.trained_model_metadata.tensorboard_logs_uri - ) + raw_tensorboard_log_uri = gbml_config_pb_wrapper.gbml_config_pb.shared_config.trained_model_metadata.tensorboard_logs_uri tensorboard_log_uri: Optional[Uri] = ( UriFactory.create_uri(raw_tensorboard_log_uri) if raw_tensorboard_log_uri From d9817f42b4097902ff30c2502d7c58263de0ee23 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 29 Apr 2026 17:04:58 +0000 Subject: [PATCH 10/59] refactor(examples): migrate graph_store/heterogeneous_training.py to TensorBoardWriter Co-Authored-By: Claude Opus 4.7 (1M context) --- .../graph_store/heterogeneous_training.py | 49 ++++++------------- 1 file changed, 15 insertions(+), 34 deletions(-) diff --git a/examples/link_prediction/graph_store/heterogeneous_training.py b/examples/link_prediction/graph_store/heterogeneous_training.py index bbfeb018d..a6ec12bb4 100644 --- a/examples/link_prediction/graph_store/heterogeneous_training.py +++ b/examples/link_prediction/graph_store/heterogeneous_training.py @@ -113,11 +113,7 @@ ) from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict -from gigl.src.common.utils.tensorboard import ( - close_tensorboard_writer, - create_tensorboard_writer, - write_tensorboard_scalar, -) +from gigl.src.common.utils.tensorboard import TensorBoardWriter from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling 
import parse_fanout @@ -468,10 +464,9 @@ def _training_process( if torch.cuda.is_available(): torch.cuda.set_device(device) print(f"---Rank {rank} training process set device {device}") - tensorboard_writer = create_tensorboard_writer( - should_log_to_tensorboard=args.should_log_to_tensorboard, - configured_tensorboard_log_uri=args.tensorboard_log_uri, - should_write_events=rank == 0, + tensorboard_writer = TensorBoardWriter.from_uri( + args.tensorboard_log_uri, + enabled=args.should_log_to_tensorboard and rank == 0, ) loss_fn = RetrievalLoss( @@ -591,22 +586,18 @@ def _training_process( print( f"rank={rank}, batch={batch_idx}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" ) - write_tensorboard_scalar( - writer=tensorboard_writer, - tag="Time/batch_mean_sec", - value=mean_batch_time, + tensorboard_writer.log( + { + "Time/batch_mean_sec": mean_batch_time, + "Loss/train": mean_train_loss, + }, step=batch_idx, ) last_n_batch_time.clear() + # log the global average training loss print( f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" ) - write_tensorboard_scalar( - writer=tensorboard_writer, - tag="Loss/train", - value=mean_train_loss, - step=batch_idx, - ) last_n_batch_avg_loss.clear() flush() @@ -624,11 +615,8 @@ def _training_process( log_every_n_batch=args.log_every_n_batch, num_batches=num_val_batches_per_process, ) - write_tensorboard_scalar( - writer=tensorboard_writer, - tag="Loss/val", - value=global_avg_val_loss, - step=batch_idx, + tensorboard_writer.log( + {"Loss/val": global_avg_val_loss}, step=batch_idx ) model.train() else: @@ -708,12 +696,7 @@ def _training_process( device=device, log_every_n_batch=args.log_every_n_batch, ) - write_tensorboard_scalar( - writer=tensorboard_writer, - tag="Loss/test", - value=global_avg_test_loss, - step=batch_idx, - ) + 
tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -741,7 +724,7 @@ def _training_process( f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" ) flush() - close_tensorboard_writer(tensorboard_writer) + tensorboard_writer.close() # Graph store mode cleanup: shutdown the compute process connection to the storage cluster. shutdown_compute_proccess() @@ -967,9 +950,7 @@ def _run_example_training( eval_metrics_uri: Optional[Uri] = ( UriFactory.create_uri(raw_eval_metrics_uri) if raw_eval_metrics_uri else None ) - raw_tensorboard_log_uri = ( - gbml_config_pb_wrapper.gbml_config_pb.shared_config.trained_model_metadata.tensorboard_logs_uri - ) + raw_tensorboard_log_uri = gbml_config_pb_wrapper.gbml_config_pb.shared_config.trained_model_metadata.tensorboard_logs_uri tensorboard_log_uri: Optional[Uri] = ( UriFactory.create_uri(raw_tensorboard_log_uri) if raw_tensorboard_log_uri From fe879dcaff742628d4bb1539b28710157edba42e Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 29 Apr 2026 17:43:30 +0000 Subject: [PATCH 11/59] cleanup --- gigl/src/training/v1/trainer.py | 4 +--- gigl/src/training/v2/glt_trainer.py | 4 +--- .../libs/gbml_and_resource_config_compatibility_checks.py | 4 +++- tests/unit/src/common/vertex_ai_test.py | 2 +- .../config_populator_functionality_test.py | 8 ++++++-- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/gigl/src/training/v1/trainer.py b/gigl/src/training/v1/trainer.py index 0d4e24b08..cdfbf4748 100644 --- a/gigl/src/training/v1/trainer.py +++ b/gigl/src/training/v1/trainer.py @@ -49,9 +49,7 @@ def run( gbml_config_uri=task_config_uri ) ) - raw_tensorboard_logs_uri = ( - gbml_config_pb_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri - ) + raw_tensorboard_logs_uri = gbml_config_pb_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri tensorboard_logs_uri 
= ( UriFactory.create_uri(raw_tensorboard_logs_uri) if gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard diff --git a/gigl/src/training/v2/glt_trainer.py b/gigl/src/training/v2/glt_trainer.py index eb50bc0f2..15e225b92 100644 --- a/gigl/src/training/v2/glt_trainer.py +++ b/gigl/src/training/v2/glt_trainer.py @@ -54,9 +54,7 @@ def __execute_VAI_training( training_process_runtime_args = ( gbml_config_pb_wrapper.trainer_config.trainer_args ) - raw_tensorboard_logs_uri = ( - gbml_config_pb_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri - ) + raw_tensorboard_logs_uri = gbml_config_pb_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri tensorboard_logs_uri = ( UriFactory.create_uri(raw_tensorboard_logs_uri) if gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard diff --git a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py index ee6422c3b..9e8d854a3 100644 --- a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py +++ b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py @@ -124,7 +124,9 @@ def check_vertex_ai_trainer_tensorboard_compatibility( return trainer_resource_config = resource_config_wrapper.trainer_config - if isinstance(trainer_resource_config, gigl_resource_config_pb2.VertexAiResourceConfig): + if isinstance( + trainer_resource_config, gigl_resource_config_pb2.VertexAiResourceConfig + ): tensorboard_resource_name = trainer_resource_config.tensorboard_resource_name elif isinstance( trainer_resource_config, gigl_resource_config_pb2.VertexAiGraphStoreConfig diff --git a/tests/unit/src/common/vertex_ai_test.py b/tests/unit/src/common/vertex_ai_test.py index 8047b35f3..2ffbc8c26 100644 --- a/tests/unit/src/common/vertex_ai_test.py +++ b/tests/unit/src/common/vertex_ai_test.py @@ -4,7 +4,7 @@ from absl.testing import absltest -from 
gigl.common.services.vertex_ai import VertexAIService, VertexAiJobConfig +from gigl.common.services.vertex_ai import VertexAiJobConfig, VertexAIService from tests.test_assets.test_case import TestCase diff --git a/tests/unit/src/config_populator/config_populator_functionality_test.py b/tests/unit/src/config_populator/config_populator_functionality_test.py index 4ce71d1af..201dac5b8 100644 --- a/tests/unit/src/config_populator/config_populator_functionality_test.py +++ b/tests/unit/src/config_populator/config_populator_functionality_test.py @@ -101,7 +101,9 @@ def test_sgs_config_population_is_accurate( ) self.assertNotEqual(trained_model_metadata_pb.trained_model_uri, "") self.assertNotEqual(trained_model_metadata_pb.scripted_model_uri, "") - self.assertTrue(trained_model_metadata_pb.tensorboard_logs_uri.endswith("/logs/")) + self.assertTrue( + trained_model_metadata_pb.tensorboard_logs_uri.endswith("/logs/") + ) # Assert inference metadata assets were set inference_metadata_pb: inference_metadata_pb2.InferenceMetadata = ( @@ -190,7 +192,9 @@ def test_glt_config_population_is_accurate( ) self.assertNotEqual(trained_model_metadata_pb.trained_model_uri, "") self.assertNotEqual(trained_model_metadata_pb.scripted_model_uri, "") - self.assertTrue(trained_model_metadata_pb.tensorboard_logs_uri.endswith("/logs/")) + self.assertTrue( + trained_model_metadata_pb.tensorboard_logs_uri.endswith("/logs/") + ) # Assert inference metadata assets were set inference_metadata_pb: inference_metadata_pb2.InferenceMetadata = ( From 8ff30127059eeb0eaca485db21c810866aec156e Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 29 Apr 2026 18:09:02 +0000 Subject: [PATCH 12/59] chore(examples): drop shouldLogToTensorboard: true from task configs The proto bool field defaults to false, so removing the line lets the example configs fall back to that default instead of forcing tensorboard logging on. Avoids implying that tensorboard logging should be on by default in example task configs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../link_prediction/configs/e2e_het_dblp_sup_task_config.yaml | 1 - .../link_prediction/configs/e2e_hom_cora_sup_task_config.yaml | 1 - .../graph_store/configs/e2e_het_dblp_sup_gs_task_config.yaml | 1 - .../graph_store/configs/e2e_hom_cora_sup_gs_task_config.yaml | 1 - 4 files changed, 4 deletions(-) diff --git a/examples/link_prediction/configs/e2e_het_dblp_sup_task_config.yaml b/examples/link_prediction/configs/e2e_het_dblp_sup_task_config.yaml index 3d4024c79..8531fd081 100644 --- a/examples/link_prediction/configs/e2e_het_dblp_sup_task_config.yaml +++ b/examples/link_prediction/configs/e2e_het_dblp_sup_task_config.yaml @@ -30,7 +30,6 @@ datasetConfig: # This argument is specific for the `PassthroughPreprocessorConfigForMockedAssets` preprocessor to indicate which dataset we should be using mocked_dataset_name: 'dblp_node_anchor_edge_features_lp' trainerConfig: - shouldLogToTensorboard: true trainerArgs: # Example argument to trainer log_every_n_batch: "50" diff --git a/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml b/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml index 845e7a9c8..606f13c29 100644 --- a/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml +++ b/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml @@ -13,7 +13,6 @@ datasetConfig: # This argument is specific for the `PassthroughPreprocessorConfigForMockedAssets` preprocessor to indicate which dataset we should be using mocked_dataset_name: 'cora_homogeneous_node_anchor_edge_features_user_defined_labels' trainerConfig: - shouldLogToTensorboard: true trainerArgs: # Example argument to trainer log_every_n_batch: "50" # Frequency in which we log batch information diff --git a/examples/link_prediction/graph_store/configs/e2e_het_dblp_sup_gs_task_config.yaml b/examples/link_prediction/graph_store/configs/e2e_het_dblp_sup_gs_task_config.yaml index 36fc48ea6..7c23186c7 100644 --- 
a/examples/link_prediction/graph_store/configs/e2e_het_dblp_sup_gs_task_config.yaml +++ b/examples/link_prediction/graph_store/configs/e2e_het_dblp_sup_gs_task_config.yaml @@ -30,7 +30,6 @@ datasetConfig: # This argument is specific for the `PassthroughPreprocessorConfigForMockedAssets` preprocessor to indicate which dataset we should be using mocked_dataset_name: 'dblp_node_anchor_edge_features_lp' trainerConfig: - shouldLogToTensorboard: true trainerArgs: # Example argument to trainer log_every_n_batch: "50" diff --git a/examples/link_prediction/graph_store/configs/e2e_hom_cora_sup_gs_task_config.yaml b/examples/link_prediction/graph_store/configs/e2e_hom_cora_sup_gs_task_config.yaml index faf4316b7..2283a2f91 100644 --- a/examples/link_prediction/graph_store/configs/e2e_hom_cora_sup_gs_task_config.yaml +++ b/examples/link_prediction/graph_store/configs/e2e_hom_cora_sup_gs_task_config.yaml @@ -16,7 +16,6 @@ datasetConfig: # This argument is specific for the `PassthroughPreprocessorConfigForMockedAssets` preprocessor to indicate which dataset we should be using mocked_dataset_name: 'cora_homogeneous_node_anchor_edge_features_user_defined_labels' trainerConfig: - shouldLogToTensorboard: true trainerArgs: # Example argument to trainer log_every_n_batch: "50" # Frequency in which we log batch information From f811f0ac7be183540e897c1963c82b272b3d56d7 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 29 Apr 2026 18:12:15 +0000 Subject: [PATCH 13/59] refactor(tensorboard): move TensorBoardWriter to gigl/utils Relocate from gigl/src/common/utils/tensorboard.py (internal pipeline utilities) to gigl/utils/tensorboard_writer.py (general-purpose user utilities). Update the test file path, the in-test patch paths, and all four example training scripts to import from the new location. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../graph_store/heterogeneous_training.py | 2 +- .../graph_store/homogeneous_training.py | 2 +- .../link_prediction/heterogeneous_training.py | 2 +- .../link_prediction/homogeneous_training.py | 2 +- .../tensorboard_writer.py} | 0 .../tensorboard_writer_test.py} | 24 +++++++++---------- 6 files changed, 16 insertions(+), 16 deletions(-) rename gigl/{src/common/utils/tensorboard.py => utils/tensorboard_writer.py} (100%) rename tests/unit/{src/common/utils/tensorboard_test.py => utils/tensorboard_writer_test.py} (82%) diff --git a/examples/link_prediction/graph_store/heterogeneous_training.py b/examples/link_prediction/graph_store/heterogeneous_training.py index a6ec12bb4..f7de87240 100644 --- a/examples/link_prediction/graph_store/heterogeneous_training.py +++ b/examples/link_prediction/graph_store/heterogeneous_training.py @@ -113,7 +113,7 @@ ) from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict -from gigl.src.common.utils.tensorboard import TensorBoardWriter +from gigl.utils.tensorboard_writer import TensorBoardWriter from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout diff --git a/examples/link_prediction/graph_store/homogeneous_training.py b/examples/link_prediction/graph_store/homogeneous_training.py index ea36f9e0d..45042cb58 100644 --- a/examples/link_prediction/graph_store/homogeneous_training.py +++ b/examples/link_prediction/graph_store/homogeneous_training.py @@ -157,7 +157,7 @@ ) from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict -from gigl.src.common.utils.tensorboard import TensorBoardWriter +from gigl.utils.tensorboard_writer import TensorBoardWriter from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout diff 
--git a/examples/link_prediction/heterogeneous_training.py b/examples/link_prediction/heterogeneous_training.py index cb136eeb1..e5d105eb1 100644 --- a/examples/link_prediction/heterogeneous_training.py +++ b/examples/link_prediction/heterogeneous_training.py @@ -63,7 +63,7 @@ ) from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict -from gigl.src.common.utils.tensorboard import TensorBoardWriter +from gigl.utils.tensorboard_writer import TensorBoardWriter from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout diff --git a/examples/link_prediction/homogeneous_training.py b/examples/link_prediction/homogeneous_training.py index 1c50af1f3..da5796cb9 100644 --- a/examples/link_prediction/homogeneous_training.py +++ b/examples/link_prediction/homogeneous_training.py @@ -58,7 +58,7 @@ ) from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict -from gigl.src.common.utils.tensorboard import TensorBoardWriter +from gigl.utils.tensorboard_writer import TensorBoardWriter from gigl.types.graph import to_homogeneous from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout diff --git a/gigl/src/common/utils/tensorboard.py b/gigl/utils/tensorboard_writer.py similarity index 100% rename from gigl/src/common/utils/tensorboard.py rename to gigl/utils/tensorboard_writer.py diff --git a/tests/unit/src/common/utils/tensorboard_test.py b/tests/unit/utils/tensorboard_writer_test.py similarity index 82% rename from tests/unit/src/common/utils/tensorboard_test.py rename to tests/unit/utils/tensorboard_writer_test.py index 19378aa97..79b428adf 100644 --- a/tests/unit/src/common/utils/tensorboard_test.py +++ b/tests/unit/utils/tensorboard_writer_test.py @@ -1,4 +1,4 @@ -"""Unit tests for 
gigl.src.common.utils.tensorboard.""" +"""Unit tests for gigl.utils.tensorboard_writer.""" import os from unittest.mock import Mock, patch @@ -6,7 +6,7 @@ from absl.testing import absltest from gigl.common import UriFactory -from gigl.src.common.utils.tensorboard import TensorBoardWriter +from gigl.utils.tensorboard_writer import TensorBoardWriter from tests.test_assets.test_case import TestCase @@ -16,7 +16,7 @@ class TestTensorBoardWriter(TestCase): def test_from_uri_returns_noop_when_disabled(self) -> None: configured_uri = UriFactory.create_uri("gs://config/logs/") with patch( - "gigl.src.common.utils.tensorboard.tf.summary.create_file_writer" + "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" ) as mock_create_file_writer: writer = TensorBoardWriter.from_uri(configured_uri, enabled=False) writer.log({"Loss/train": 1.0}, step=0) @@ -32,7 +32,7 @@ def test_from_uri_prefers_vertex_env_var(self) -> None: clear=False, ): with patch( - "gigl.src.common.utils.tensorboard.tf.summary.create_file_writer" + "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" ) as mock_create_file_writer: TensorBoardWriter.from_uri(configured_uri) @@ -42,7 +42,7 @@ def test_from_uri_falls_back_to_configured_uri(self) -> None: configured_uri = UriFactory.create_uri("gs://config/logs/") with patch.dict(os.environ, {}, clear=True): with patch( - "gigl.src.common.utils.tensorboard.tf.summary.create_file_writer" + "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" ) as mock_create_file_writer: TensorBoardWriter.from_uri(configured_uri) @@ -51,7 +51,7 @@ def test_from_uri_falls_back_to_configured_uri(self) -> None: def test_from_uri_returns_noop_when_no_uri_anywhere(self) -> None: with patch.dict(os.environ, {}, clear=True): with patch( - "gigl.src.common.utils.tensorboard.tf.summary.create_file_writer" + "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" ) as mock_create_file_writer: writer = TensorBoardWriter.from_uri(configured_uri=None) 
writer.log({"Loss/train": 1.0}, step=0) @@ -59,7 +59,7 @@ def test_from_uri_returns_noop_when_no_uri_anywhere(self) -> None: mock_create_file_writer.assert_not_called() - @patch("gigl.src.common.utils.tensorboard.tf.summary.scalar") + @patch("gigl.utils.tensorboard_writer.tf.summary.scalar") def test_log_writes_each_metric_at_step_and_flushes( self, mock_summary_scalar ) -> None: @@ -67,7 +67,7 @@ def test_log_writes_each_metric_at_step_and_flushes( underlying_writer.as_default.return_value.__enter__ = Mock(return_value=None) underlying_writer.as_default.return_value.__exit__ = Mock(return_value=None) with patch( - "gigl.src.common.utils.tensorboard.tf.summary.create_file_writer", + "gigl.utils.tensorboard_writer.tf.summary.create_file_writer", return_value=underlying_writer, ): writer = TensorBoardWriter(log_dir="gs://logs/") @@ -78,10 +78,10 @@ def test_log_writes_each_metric_at_step_and_flushes( mock_summary_scalar.assert_any_call("Loss/val", 2.0, step=10) underlying_writer.flush.assert_called_once() - @patch("gigl.src.common.utils.tensorboard.tf.summary.scalar") + @patch("gigl.utils.tensorboard_writer.tf.summary.scalar") def test_log_is_noop_when_writer_disabled(self, mock_summary_scalar) -> None: with patch( - "gigl.src.common.utils.tensorboard.tf.summary.create_file_writer" + "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" ) as mock_create_file_writer: writer = TensorBoardWriter(log_dir=None) writer.log({"Loss/train": 1.0}, step=0) @@ -92,7 +92,7 @@ def test_log_is_noop_when_writer_disabled(self, mock_summary_scalar) -> None: def test_context_manager_closes_writer(self) -> None: underlying_writer = Mock() with patch( - "gigl.src.common.utils.tensorboard.tf.summary.create_file_writer", + "gigl.utils.tensorboard_writer.tf.summary.create_file_writer", return_value=underlying_writer, ): with TensorBoardWriter(log_dir="gs://logs/"): @@ -103,7 +103,7 @@ def test_context_manager_closes_writer(self) -> None: def test_close_is_idempotent(self) -> None: 
underlying_writer = Mock() with patch( - "gigl.src.common.utils.tensorboard.tf.summary.create_file_writer", + "gigl.utils.tensorboard_writer.tf.summary.create_file_writer", return_value=underlying_writer, ): writer = TensorBoardWriter(log_dir="gs://logs/") From e97bd6432a03f4b4a982f5e3d4d3b1d044ac8bdd Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 29 Apr 2026 20:58:41 +0000 Subject: [PATCH 14/59] refactor(examples): drop should_log_to_tensorboard gate Examples now log to TensorBoard whenever a tensorboard_log_uri is set on the gbml config; no separate boolean to flip. Removes the should_log_to_tensorboard field from TrainingProcessArgs (and its docstring entry, extraction from the gbml config, and pass-through to _training_process) in all four link-prediction example training scripts. TensorBoardWriter.from_uri already returns a no-op writer when the URI is None, so a single signal (the URI) is sufficient. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../link_prediction/graph_store/heterogeneous_training.py | 8 +------- .../link_prediction/graph_store/homogeneous_training.py | 8 +------- examples/link_prediction/heterogeneous_training.py | 8 +------- examples/link_prediction/homogeneous_training.py | 8 +------- 4 files changed, 4 insertions(+), 28 deletions(-) diff --git a/examples/link_prediction/graph_store/heterogeneous_training.py b/examples/link_prediction/graph_store/heterogeneous_training.py index f7de87240..76fc9d633 100644 --- a/examples/link_prediction/graph_store/heterogeneous_training.py +++ b/examples/link_prediction/graph_store/heterogeneous_training.py @@ -390,7 +390,6 @@ class TrainingProcessArgs: num_val_batches (int): Number of validation batches across all processes. val_every_n_batch (int): Frequency to run validation during training. log_every_n_batch (int): Frequency to log batch information during training. - should_log_to_tensorboard (bool): If True, emit TensorBoard summaries. 
should_skip_training (bool): If True, skip training and only run testing. """ @@ -425,7 +424,6 @@ class TrainingProcessArgs: num_val_batches: int val_every_n_batch: int log_every_n_batch: int - should_log_to_tensorboard: bool should_skip_training: bool @@ -466,7 +464,7 @@ def _training_process( print(f"---Rank {rank} training process set device {device}") tensorboard_writer = TensorBoardWriter.from_uri( args.tensorboard_log_uri, - enabled=args.should_log_to_tensorboard and rank == 0, + enabled=rank == 0, ) loss_fn = RetrievalLoss( @@ -957,9 +955,6 @@ def _run_example_training( else None ) - should_log_to_tensorboard = ( - gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard - ) should_skip_training = gbml_config_pb_wrapper.shared_config.should_skip_training supervision_edge_types = ( @@ -999,7 +994,6 @@ def _run_example_training( num_val_batches=num_val_batches, val_every_n_batch=val_every_n_batch, log_every_n_batch=log_every_n_batch, - should_log_to_tensorboard=should_log_to_tensorboard, should_skip_training=should_skip_training, ) diff --git a/examples/link_prediction/graph_store/homogeneous_training.py b/examples/link_prediction/graph_store/homogeneous_training.py index 45042cb58..b8e683b98 100644 --- a/examples/link_prediction/graph_store/homogeneous_training.py +++ b/examples/link_prediction/graph_store/homogeneous_training.py @@ -385,7 +385,6 @@ class TrainingProcessArgs: num_val_batches (int): Number of validation batches across all processes. val_every_n_batch (int): Frequency to run validation during training. log_every_n_batch (int): Frequency to log batch information during training. - should_log_to_tensorboard (bool): If True, emit TensorBoard summaries. should_skip_training (bool): If True, skip training and only run testing. 
""" @@ -417,7 +416,6 @@ class TrainingProcessArgs: num_val_batches: int val_every_n_batch: int log_every_n_batch: int - should_log_to_tensorboard: bool should_skip_training: bool @@ -457,7 +455,7 @@ def _training_process( logger.info(f"---Rank {rank} training process set device {device}") tensorboard_writer = TensorBoardWriter.from_uri( args.tensorboard_log_uri, - enabled=args.should_log_to_tensorboard and rank == 0, + enabled=rank == 0, ) loss_fn = RetrievalLoss( @@ -944,9 +942,6 @@ def _run_example_training( else None ) - should_log_to_tensorboard = ( - gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard - ) should_skip_training = gbml_config_pb_wrapper.shared_config.should_skip_training # Step 4: Spawn training processes @@ -976,7 +971,6 @@ def _run_example_training( num_val_batches=num_val_batches, val_every_n_batch=val_every_n_batch, log_every_n_batch=log_every_n_batch, - should_log_to_tensorboard=should_log_to_tensorboard, should_skip_training=should_skip_training, ) diff --git a/examples/link_prediction/heterogeneous_training.py b/examples/link_prediction/heterogeneous_training.py index e5d105eb1..282c1fd6f 100644 --- a/examples/link_prediction/heterogeneous_training.py +++ b/examples/link_prediction/heterogeneous_training.py @@ -331,7 +331,6 @@ class TrainingProcessArgs: num_val_batches (int): Number of validation batches across all processes. val_every_n_batch (int): Frequency to run validation during training. log_every_n_batch (int): Frequency to log batch information during training. - should_log_to_tensorboard (bool): If True, emit TensorBoard summaries. should_skip_training (bool): If True, skip training and only run testing. 
""" @@ -370,7 +369,6 @@ class TrainingProcessArgs: num_val_batches: int val_every_n_batch: int log_every_n_batch: int - should_log_to_tensorboard: bool should_skip_training: bool @@ -408,7 +406,7 @@ def _training_process( is_chief_process = args.machine_rank == 0 and local_rank == 0 tensorboard_writer = TensorBoardWriter.from_uri( args.tensorboard_log_uri, - enabled=args.should_log_to_tensorboard and is_chief_process, + enabled=is_chief_process, ) loss_fn = RetrievalLoss( loss=torch.nn.CrossEntropyLoss(reduction="mean"), @@ -912,9 +910,6 @@ def _run_example_training( else None ) - should_log_to_tensorboard = ( - gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard - ) should_skip_training = gbml_config_pb_wrapper.shared_config.should_skip_training supervision_edge_types = ( @@ -956,7 +951,6 @@ def _run_example_training( num_val_batches=num_val_batches, val_every_n_batch=val_every_n_batch, log_every_n_batch=log_every_n_batch, - should_log_to_tensorboard=should_log_to_tensorboard, should_skip_training=should_skip_training, ) diff --git a/examples/link_prediction/homogeneous_training.py b/examples/link_prediction/homogeneous_training.py index da5796cb9..9548b678c 100644 --- a/examples/link_prediction/homogeneous_training.py +++ b/examples/link_prediction/homogeneous_training.py @@ -290,7 +290,6 @@ class TrainingProcessArgs: num_val_batches (int): Number of validation batches across all processes. val_every_n_batch (int): Frequency to run validation during training. log_every_n_batch (int): Frequency to log batch information during training. - should_log_to_tensorboard (bool): If True, emit TensorBoard summaries. should_skip_training (bool): If True, skip training and only run testing. 
""" @@ -328,7 +327,6 @@ class TrainingProcessArgs: num_val_batches: int val_every_n_batch: int log_every_n_batch: int - should_log_to_tensorboard: bool should_skip_training: bool @@ -367,7 +365,7 @@ def _training_process( is_chief_process = args.machine_rank == 0 and local_rank == 0 tensorboard_writer = TensorBoardWriter.from_uri( args.tensorboard_log_uri, - enabled=args.should_log_to_tensorboard and is_chief_process, + enabled=is_chief_process, ) loss_fn = RetrievalLoss( @@ -848,9 +846,6 @@ def _run_example_training( else None ) - should_log_to_tensorboard = ( - gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard - ) should_skip_training = gbml_config_pb_wrapper.shared_config.should_skip_training logger.info("--- Launching training processes ...\n") @@ -882,7 +877,6 @@ def _run_example_training( num_val_batches=num_val_batches, val_every_n_batch=val_every_n_batch, log_every_n_batch=log_every_n_batch, - should_log_to_tensorboard=should_log_to_tensorboard, should_skip_training=should_skip_training, ) From 187123c7480d8e3aff9643f66bb9156b46c07715 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 29 Apr 2026 21:52:50 +0000 Subject: [PATCH 15/59] refactor(trainer): always forward tensorboard_logs_uri when proto is set V1 and V2 trainer launchers previously gated tensorboard_logs_uri construction on both should_log_to_tensorboard AND a non-empty proto field. Drop the should_log_to_tensorboard half so the URI is forwarded to the Vertex AI launcher whenever the proto field is set. One signal (URI presence) instead of two. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- gigl/src/training/v1/trainer.py | 3 +-- gigl/src/training/v2/glt_trainer.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/gigl/src/training/v1/trainer.py b/gigl/src/training/v1/trainer.py index cdfbf4748..0c8790663 100644 --- a/gigl/src/training/v1/trainer.py +++ b/gigl/src/training/v1/trainer.py @@ -52,8 +52,7 @@ def run( raw_tensorboard_logs_uri = gbml_config_pb_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri tensorboard_logs_uri = ( UriFactory.create_uri(raw_tensorboard_logs_uri) - if gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard - and raw_tensorboard_logs_uri + if raw_tensorboard_logs_uri else None ) launch_single_pool_job( diff --git a/gigl/src/training/v2/glt_trainer.py b/gigl/src/training/v2/glt_trainer.py index 15e225b92..4f2ecadd1 100644 --- a/gigl/src/training/v2/glt_trainer.py +++ b/gigl/src/training/v2/glt_trainer.py @@ -57,8 +57,7 @@ def __execute_VAI_training( raw_tensorboard_logs_uri = gbml_config_pb_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri tensorboard_logs_uri = ( UriFactory.create_uri(raw_tensorboard_logs_uri) - if gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard - and raw_tensorboard_logs_uri + if raw_tensorboard_logs_uri else None ) From 3b7133cdf5892eb36d7b8f2c99f8311c4bd8e565 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Thu, 30 Apr 2026 19:56:42 +0000 Subject: [PATCH 16/59] refactor(tensorboard): replace from_uri with from_env MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TensorBoardWriter now reads AIP_TENSORBOARD_LOG_DIR directly and raises when unset. Vertex AI populates that env var from CustomJobSpec.baseOutputDirectory, which GiGL's launcher already derives from tensorboardLogsUri — so the prior fallback in _resolve_log_dir was dead weight. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- gigl/utils/tensorboard_writer.py | 77 ++++++++++----------- tests/unit/utils/tensorboard_writer_test.py | 43 +++++------- 2 files changed, 55 insertions(+), 65 deletions(-) diff --git a/gigl/utils/tensorboard_writer.py b/gigl/utils/tensorboard_writer.py index d7fd54c2f..3e0e674eb 100644 --- a/gigl/utils/tensorboard_writer.py +++ b/gigl/utils/tensorboard_writer.py @@ -5,43 +5,30 @@ import tensorflow as tf -from gigl.common import Uri - +# Vertex AI sets this env var to ``/logs/`` (or +# ``//logs/`` for HyperparameterTuningJob trials) +# when ``CustomJobSpec.baseOutputDirectory`` is configured. GiGL's launcher +# derives ``baseOutputDirectory`` from the GbmlConfig's ``tensorboardLogsUri`` +# (see ``gigl/src/common/vertex_ai_launcher.py``), so within a GiGL-launched +# trainer this env var is the authoritative log directory. +# +# References: +# https://cloud.google.com/vertex-ai/docs/training/code-requirements +# https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#FIELDS.base_output_directory _VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY = "AIP_TENSORBOARD_LOG_DIR" -def _resolve_log_dir(configured_uri: Optional[Uri]) -> Optional[str]: - """Resolve the TensorBoard log directory. - - Vertex AI populates ``AIP_TENSORBOARD_LOG_DIR`` when ``baseOutputDirectory`` - is configured on a CustomJob. Outside Vertex AI, GiGL falls back to the - URI from the task config. - - Args: - configured_uri: The TensorBoard URI from GiGL config. - - Returns: - The resolved log directory, or ``None`` when no directory is available. - """ - vertex_log_dir = os.environ.get(_VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY) - if vertex_log_dir: - return vertex_log_dir - if configured_uri is None: - return None - return configured_uri.uri - - class TensorBoardWriter: """Writes scalar metrics to TensorBoard. - No-ops when disabled or when no log directory is available, so callers - never see ``Optional[TensorBoardWriter]`` plumbing. 
+ No-ops when disabled, so callers never see ``Optional[TensorBoardWriter]`` + plumbing across chief / non-chief ranks. The writer flushes after every ``log()`` call so that Vertex's TensorBoard UI sees events live as training progresses. Example: - >>> with TensorBoardWriter.from_uri(uri, enabled=is_chief and should_log) as tb: + >>> with TensorBoardWriter.from_env(enabled=is_chief_process) as tb: ... tb.log({"Loss/train": loss, "Loss/val": vloss}, step=batch_idx) """ @@ -58,30 +45,40 @@ def __init__(self, log_dir: Optional[str]) -> None: self._closed = False @classmethod - def from_uri( - cls, - configured_uri: Optional[Uri], - *, - enabled: bool = True, - ) -> "TensorBoardWriter": - """Build a writer with Vertex AI env-var precedence. + def from_env(cls, *, enabled: bool = True) -> "TensorBoardWriter": + """Build a writer from Vertex AI's ``AIP_TENSORBOARD_LOG_DIR`` env var. When ``enabled`` is ``False``, returns a no-op writer without reading - the environment or the configured URI. + the environment. This is the path non-chief ranks take so they can + share the same call sites as the chief. + + When ``enabled`` is ``True``, the env var must be set; otherwise this + raises ``RuntimeError`` rather than silently no-op'ing. The env var is + populated by Vertex AI from ``CustomJobSpec.baseOutputDirectory`` (see + the references in this module's header). Args: - configured_uri: The TensorBoard URI from GiGL config. Used only - when ``AIP_TENSORBOARD_LOG_DIR`` is unset. enabled: Whether this caller is responsible for writing events. - Typically ``should_log_to_tensorboard and is_chief_process``. + Typically ``is_chief_process``. Returns: - A ``TensorBoardWriter`` instance — real if enabled and a log - directory was resolved, no-op otherwise. + A ``TensorBoardWriter`` instance — real if enabled, no-op otherwise. + + Raises: + RuntimeError: If ``enabled`` is True and ``AIP_TENSORBOARD_LOG_DIR`` + is not set in the environment. 
""" if not enabled: return cls(log_dir=None) - return cls(log_dir=_resolve_log_dir(configured_uri)) + log_dir = os.environ.get(_VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY) + if not log_dir: + raise RuntimeError( + f"{_VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY} is not set. " + "TensorBoardWriter.from_env() requires the trainer to run as " + "a Vertex AI CustomJob with baseOutputDirectory configured. " + "See https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#FIELDS.base_output_directory." + ) + return cls(log_dir=log_dir) def log(self, metrics: dict[str, float], step: int) -> None: """Write each metric scalar at ``step`` and flush. diff --git a/tests/unit/utils/tensorboard_writer_test.py b/tests/unit/utils/tensorboard_writer_test.py index 79b428adf..9d11eca6d 100644 --- a/tests/unit/utils/tensorboard_writer_test.py +++ b/tests/unit/utils/tensorboard_writer_test.py @@ -5,7 +5,6 @@ from absl.testing import absltest -from gigl.common import UriFactory from gigl.utils.tensorboard_writer import TensorBoardWriter from tests.test_assets.test_case import TestCase @@ -13,19 +12,9 @@ class TestTensorBoardWriter(TestCase): """Tests for the TensorBoardWriter class.""" - def test_from_uri_returns_noop_when_disabled(self) -> None: - configured_uri = UriFactory.create_uri("gs://config/logs/") - with patch( - "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" - ) as mock_create_file_writer: - writer = TensorBoardWriter.from_uri(configured_uri, enabled=False) - writer.log({"Loss/train": 1.0}, step=0) - writer.close() - - mock_create_file_writer.assert_not_called() - - def test_from_uri_prefers_vertex_env_var(self) -> None: - configured_uri = UriFactory.create_uri("gs://config/logs/") + def test_from_env_returns_noop_when_disabled(self) -> None: + # When disabled (e.g. non-chief rank), env var state is irrelevant + # and no TF writer is constructed. 
with patch.dict( os.environ, {"AIP_TENSORBOARD_LOG_DIR": "gs://vertex-managed/logs"}, @@ -34,28 +23,32 @@ def test_from_uri_prefers_vertex_env_var(self) -> None: with patch( "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" ) as mock_create_file_writer: - TensorBoardWriter.from_uri(configured_uri) + writer = TensorBoardWriter.from_env(enabled=False) + writer.log({"Loss/train": 1.0}, step=0) + writer.close() - mock_create_file_writer.assert_called_once_with("gs://vertex-managed/logs") + mock_create_file_writer.assert_not_called() - def test_from_uri_falls_back_to_configured_uri(self) -> None: - configured_uri = UriFactory.create_uri("gs://config/logs/") - with patch.dict(os.environ, {}, clear=True): + def test_from_env_uses_vertex_env_var(self) -> None: + with patch.dict( + os.environ, + {"AIP_TENSORBOARD_LOG_DIR": "gs://vertex-managed/logs"}, + clear=False, + ): with patch( "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" ) as mock_create_file_writer: - TensorBoardWriter.from_uri(configured_uri) + TensorBoardWriter.from_env() - mock_create_file_writer.assert_called_once_with(configured_uri.uri) + mock_create_file_writer.assert_called_once_with("gs://vertex-managed/logs") - def test_from_uri_returns_noop_when_no_uri_anywhere(self) -> None: + def test_from_env_raises_when_env_var_missing(self) -> None: with patch.dict(os.environ, {}, clear=True): with patch( "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" ) as mock_create_file_writer: - writer = TensorBoardWriter.from_uri(configured_uri=None) - writer.log({"Loss/train": 1.0}, step=0) - writer.close() + with self.assertRaises(RuntimeError): + TensorBoardWriter.from_env() mock_create_file_writer.assert_not_called() From 96ea473f68eaebef5c83255d1a5c58c9e72eb18d Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Thu, 30 Apr 2026 20:50:26 +0000 Subject: [PATCH 17/59] refactor(examples): drop tensorboard URI plumbing in homogeneous_training Now that TensorBoardWriter reads 
AIP_TENSORBOARD_LOG_DIR directly via from_env(), the example no longer needs to plumb tensorboard_logs_uri from the GbmlConfig through TrainingProcessArgs to the worker process. Co-Authored-By: Claude Opus 4.7 (1M context) --- examples/link_prediction/homogeneous_training.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/examples/link_prediction/homogeneous_training.py b/examples/link_prediction/homogeneous_training.py index 9548b678c..3779605f9 100644 --- a/examples/link_prediction/homogeneous_training.py +++ b/examples/link_prediction/homogeneous_training.py @@ -270,7 +270,6 @@ class TrainingProcessArgs: model_uri (Uri): URI to save/load the trained model state dict. eval_metrics_uri (Optional[Uri]): Destination URI for writing evaluation metrics in KFP-compatible JSON format. If None, metrics are not written. - tensorboard_log_uri (Optional[Uri]): Destination URI for TensorBoard logs. hid_dim (int): Hidden dimension of the model. out_dim (int): Output dimension of the model. node_feature_dim (int): Input node feature dimension for the model. 
@@ -306,7 +305,6 @@ class TrainingProcessArgs: # Model model_uri: Uri eval_metrics_uri: Optional[Uri] - tensorboard_log_uri: Optional[Uri] hid_dim: int out_dim: int node_feature_dim: int @@ -363,10 +361,7 @@ def _training_process( logger.info(f"---Rank {rank} training process group initialized") is_chief_process = args.machine_rank == 0 and local_rank == 0 - tensorboard_writer = TensorBoardWriter.from_uri( - args.tensorboard_log_uri, - enabled=is_chief_process, - ) + tensorboard_writer = TensorBoardWriter.from_env(enabled=is_chief_process) loss_fn = RetrievalLoss( loss=torch.nn.CrossEntropyLoss(reduction="mean"), @@ -839,12 +834,6 @@ def _run_example_training( eval_metrics_uri: Optional[Uri] = ( UriFactory.create_uri(raw_eval_metrics_uri) if raw_eval_metrics_uri else None ) - raw_tensorboard_log_uri = gbml_config_pb_wrapper.gbml_config_pb.shared_config.trained_model_metadata.tensorboard_logs_uri - tensorboard_log_uri: Optional[Uri] = ( - UriFactory.create_uri(raw_tensorboard_log_uri) - if raw_tensorboard_log_uri - else None - ) should_skip_training = gbml_config_pb_wrapper.shared_config.should_skip_training @@ -860,7 +849,6 @@ def _run_example_training( dataset=dataset, model_uri=model_uri, eval_metrics_uri=eval_metrics_uri, - tensorboard_log_uri=tensorboard_log_uri, hid_dim=hid_dim, out_dim=out_dim, node_feature_dim=node_feature_dim, From 23f0aaebfa04e66aa7e3ea613e0bbeca899c7d4f Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Thu, 30 Apr 2026 21:53:52 +0000 Subject: [PATCH 18/59] refactor(examples): drop tensorboard URI plumbing from training examples Now that TensorBoardWriter reads AIP_TENSORBOARD_LOG_DIR directly via from_env(), the examples no longer need to plumb tensorboard_logs_uri from the GbmlConfig through TrainingProcessArgs to the worker process. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../graph_store/heterogeneous_training.py | 15 ++------------- .../graph_store/homogeneous_training.py | 15 ++------------- .../link_prediction/heterogeneous_training.py | 14 +------------- 3 files changed, 5 insertions(+), 39 deletions(-) diff --git a/examples/link_prediction/graph_store/heterogeneous_training.py b/examples/link_prediction/graph_store/heterogeneous_training.py index 76fc9d633..059e63adc 100644 --- a/examples/link_prediction/graph_store/heterogeneous_training.py +++ b/examples/link_prediction/graph_store/heterogeneous_training.py @@ -373,7 +373,6 @@ class TrainingProcessArgs: sharing between local processes. supervision_edge_type (EdgeType): The supervision edge type for training. model_uri (Uri): URI to save/load the trained model state dict. - tensorboard_log_uri (Optional[Uri]): Destination URI for TensorBoard logs. hid_dim (int): Hidden dimension of the model. out_dim (int): Output dimension of the model. node_type_to_feature_dim (dict[NodeType, int]): Mapping of node types to their feature dimensions. 
@@ -403,7 +402,6 @@ class TrainingProcessArgs: # Model model_uri: Uri eval_metrics_uri: Optional[Uri] - tensorboard_log_uri: Optional[Uri] hid_dim: int out_dim: int node_type_to_feature_dim: dict[NodeType, int] @@ -462,10 +460,8 @@ def _training_process( if torch.cuda.is_available(): torch.cuda.set_device(device) print(f"---Rank {rank} training process set device {device}") - tensorboard_writer = TensorBoardWriter.from_uri( - args.tensorboard_log_uri, - enabled=rank == 0, - ) + is_chief_process = rank == 0 + tensorboard_writer = TensorBoardWriter.from_env(enabled=is_chief_process) loss_fn = RetrievalLoss( loss=torch.nn.CrossEntropyLoss(reduction="mean"), @@ -948,12 +944,6 @@ def _run_example_training( eval_metrics_uri: Optional[Uri] = ( UriFactory.create_uri(raw_eval_metrics_uri) if raw_eval_metrics_uri else None ) - raw_tensorboard_log_uri = gbml_config_pb_wrapper.gbml_config_pb.shared_config.trained_model_metadata.tensorboard_logs_uri - tensorboard_log_uri: Optional[Uri] = ( - UriFactory.create_uri(raw_tensorboard_log_uri) - if raw_tensorboard_log_uri - else None - ) should_skip_training = gbml_config_pb_wrapper.shared_config.should_skip_training @@ -977,7 +967,6 @@ def _run_example_training( supervision_edge_type=supervision_edge_type, model_uri=model_uri, eval_metrics_uri=eval_metrics_uri, - tensorboard_log_uri=tensorboard_log_uri, hid_dim=hid_dim, out_dim=out_dim, node_type_to_feature_dim=node_type_to_feature_dim, diff --git a/examples/link_prediction/graph_store/homogeneous_training.py b/examples/link_prediction/graph_store/homogeneous_training.py index b8e683b98..5a68262ae 100644 --- a/examples/link_prediction/graph_store/homogeneous_training.py +++ b/examples/link_prediction/graph_store/homogeneous_training.py @@ -368,7 +368,6 @@ class TrainingProcessArgs: model_uri (Uri): URI to save/load the trained model state dict. eval_metrics_uri (Optional[Uri]): Destination URI for writing evaluation metrics in KFP-compatible JSON format. 
If None, metrics are not written. - tensorboard_log_uri (Optional[Uri]): Destination URI for TensorBoard logs. hid_dim (int): Hidden dimension of the model. out_dim (int): Output dimension of the model. node_feature_dim (int): Input node feature dimension for the model. @@ -395,7 +394,6 @@ class TrainingProcessArgs: # Model model_uri: Uri eval_metrics_uri: Optional[Uri] - tensorboard_log_uri: Optional[Uri] hid_dim: int out_dim: int node_feature_dim: int @@ -453,10 +451,8 @@ def _training_process( if torch.cuda.is_available(): torch.cuda.set_device(device) logger.info(f"---Rank {rank} training process set device {device}") - tensorboard_writer = TensorBoardWriter.from_uri( - args.tensorboard_log_uri, - enabled=rank == 0, - ) + is_chief_process = rank == 0 + tensorboard_writer = TensorBoardWriter.from_env(enabled=is_chief_process) loss_fn = RetrievalLoss( loss=torch.nn.CrossEntropyLoss(reduction="mean"), @@ -935,12 +931,6 @@ def _run_example_training( eval_metrics_uri: Optional[Uri] = ( UriFactory.create_uri(raw_eval_metrics_uri) if raw_eval_metrics_uri else None ) - raw_tensorboard_log_uri = gbml_config_pb_wrapper.gbml_config_pb.shared_config.trained_model_metadata.tensorboard_logs_uri - tensorboard_log_uri: Optional[Uri] = ( - UriFactory.create_uri(raw_tensorboard_log_uri) - if raw_tensorboard_log_uri - else None - ) should_skip_training = gbml_config_pb_wrapper.shared_config.should_skip_training @@ -954,7 +944,6 @@ def _run_example_training( cluster_info=cluster_info, model_uri=model_uri, eval_metrics_uri=eval_metrics_uri, - tensorboard_log_uri=tensorboard_log_uri, hid_dim=hid_dim, out_dim=out_dim, node_feature_dim=node_feature_dim, diff --git a/examples/link_prediction/heterogeneous_training.py b/examples/link_prediction/heterogeneous_training.py index 282c1fd6f..62bde60c5 100644 --- a/examples/link_prediction/heterogeneous_training.py +++ b/examples/link_prediction/heterogeneous_training.py @@ -309,7 +309,6 @@ class TrainingProcessArgs: model_uri (Uri): URI to 
save/load the trained model state dict. eval_metrics_uri (Optional[Uri]): Destination URI for writing evaluation metrics in KFP-compatible JSON format. If None, metrics are not written. - tensorboard_log_uri (Optional[Uri]): Destination URI for TensorBoard logs. hid_dim (int): Hidden dimension of the model. out_dim (int): Output dimension of the model. node_type_to_feature_dim (dict[NodeType, int]): Mapping of node types to their feature @@ -348,7 +347,6 @@ class TrainingProcessArgs: # Model model_uri: Uri eval_metrics_uri: Optional[Uri] - tensorboard_log_uri: Optional[Uri] hid_dim: int out_dim: int node_type_to_feature_dim: dict[NodeType, int] @@ -404,10 +402,7 @@ def _training_process( torch.cuda.set_device(device) logger.info(f"---Rank {rank} training process set device {device}") is_chief_process = args.machine_rank == 0 and local_rank == 0 - tensorboard_writer = TensorBoardWriter.from_uri( - args.tensorboard_log_uri, - enabled=is_chief_process, - ) + tensorboard_writer = TensorBoardWriter.from_env(enabled=is_chief_process) loss_fn = RetrievalLoss( loss=torch.nn.CrossEntropyLoss(reduction="mean"), temperature=0.07, @@ -903,12 +898,6 @@ def _run_example_training( eval_metrics_uri: Optional[Uri] = ( UriFactory.create_uri(raw_eval_metrics_uri) if raw_eval_metrics_uri else None ) - raw_tensorboard_log_uri = gbml_config_pb_wrapper.gbml_config_pb.shared_config.trained_model_metadata.tensorboard_logs_uri - tensorboard_log_uri: Optional[Uri] = ( - UriFactory.create_uri(raw_tensorboard_log_uri) - if raw_tensorboard_log_uri - else None - ) should_skip_training = gbml_config_pb_wrapper.shared_config.should_skip_training @@ -934,7 +923,6 @@ def _run_example_training( supervision_edge_type=supervision_edge_type, model_uri=model_uri, eval_metrics_uri=eval_metrics_uri, - tensorboard_log_uri=tensorboard_log_uri, hid_dim=hid_dim, out_dim=out_dim, node_type_to_feature_dim=node_type_to_feature_dim, From 1c14a16848383e405b7b9360e99340170896ab20 Mon Sep 17 00:00:00 2001 From: 
kmontemayor Date: Fri, 1 May 2026 00:09:46 +0000 Subject: [PATCH 19/59] Update --- examples/link_prediction/graph_store/heterogeneous_training.py | 2 +- examples/link_prediction/graph_store/homogeneous_training.py | 2 +- examples/link_prediction/heterogeneous_training.py | 2 +- examples/link_prediction/homogeneous_training.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/link_prediction/graph_store/heterogeneous_training.py b/examples/link_prediction/graph_store/heterogeneous_training.py index 059e63adc..1c0e956a0 100644 --- a/examples/link_prediction/graph_store/heterogeneous_training.py +++ b/examples/link_prediction/graph_store/heterogeneous_training.py @@ -113,9 +113,9 @@ ) from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict -from gigl.utils.tensorboard_writer import TensorBoardWriter from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout +from gigl.utils.tensorboard_writer import TensorBoardWriter logger = Logger() diff --git a/examples/link_prediction/graph_store/homogeneous_training.py b/examples/link_prediction/graph_store/homogeneous_training.py index 5a68262ae..8bc93f535 100644 --- a/examples/link_prediction/graph_store/homogeneous_training.py +++ b/examples/link_prediction/graph_store/homogeneous_training.py @@ -157,9 +157,9 @@ ) from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict -from gigl.utils.tensorboard_writer import TensorBoardWriter from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout +from gigl.utils.tensorboard_writer import TensorBoardWriter logger = Logger() diff --git a/examples/link_prediction/heterogeneous_training.py b/examples/link_prediction/heterogeneous_training.py index 62bde60c5..3910d67ad 100644 --- 
a/examples/link_prediction/heterogeneous_training.py +++ b/examples/link_prediction/heterogeneous_training.py @@ -63,9 +63,9 @@ ) from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict -from gigl.utils.tensorboard_writer import TensorBoardWriter from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout +from gigl.utils.tensorboard_writer import TensorBoardWriter logger = Logger() diff --git a/examples/link_prediction/homogeneous_training.py b/examples/link_prediction/homogeneous_training.py index 3779605f9..6470ab1ef 100644 --- a/examples/link_prediction/homogeneous_training.py +++ b/examples/link_prediction/homogeneous_training.py @@ -58,10 +58,10 @@ ) from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.model import load_state_dict_from_uri, save_state_dict -from gigl.utils.tensorboard_writer import TensorBoardWriter from gigl.types.graph import to_homogeneous from gigl.utils.iterator import InfiniteIterator from gigl.utils.sampling import parse_fanout +from gigl.utils.tensorboard_writer import TensorBoardWriter logger = Logger() From 667df9b64801ab1b57c0cd7b69913623dc3549c1 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Mon, 4 May 2026 16:50:28 +0000 Subject: [PATCH 20/59] proto: add TrainerConfig.tensorboard_experiment_name --- .../snapchat/research/gbml/gbml_config.proto | 10 +++ .../gbml/gbml_config/GbmlConfig.scala | 39 ++++++++++++ .../gbml/gbml_config/GbmlConfigProto.scala | 61 ++++++++++--------- .../gbml/gbml_config/GbmlConfig.scala | 39 ++++++++++++ .../gbml/gbml_config/GbmlConfigProto.scala | 61 ++++++++++--------- snapchat/research/gbml/gbml_config_pb2.py | 46 +++++++------- snapchat/research/gbml/gbml_config_pb2.pyi | 14 ++++- 7 files changed, 186 insertions(+), 84 deletions(-) diff --git a/proto/snapchat/research/gbml/gbml_config.proto 
b/proto/snapchat/research/gbml/gbml_config.proto index b8e50d834..605ee500a 100644 --- a/proto/snapchat/research/gbml/gbml_config.proto +++ b/proto/snapchat/research/gbml/gbml_config.proto @@ -201,6 +201,16 @@ message GbmlConfig { // Weather to log to tensorboard or not (defaults to false) bool should_log_to_tensorboard = 12; + // Optional. When set, the trainer's CustomJob is submitted as a run of + // a Vertex AI Experiment with this name (instead of attaching the raw + // Tensorboard resource directly). Multiple jobs that share the same + // value land in the same backing TensorboardExperiment, so they appear + // as comparable runs on one TensorBoard page. Requires + // GiglResourceConfig...tensorboard_resource_name to be set; that TB + // becomes the experiment's backing TB. Allowed characters: lowercase + // letters, digits, hyphens (Vertex AI Experiment ID rules). + string tensorboard_experiment_name = 14; + // Configuration for GraphStore storage. // If setup, then GiGLResourceConfig.trainer_resource_config.vertex_ai_graph_store_trainer_config must be set. // e.g. With separte job configs for storage and compute jobs. diff --git a/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala b/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala index 63c31ede6..b7f0507e5 100644 --- a/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala +++ b/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala @@ -3966,6 +3966,15 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb * Arguments to parameterize training process with. * @param shouldLogToTensorboard * Weather to log to tensorboard or not (defaults to false) + * @param tensorboardExperimentName + * Optional. When set, the trainer's CustomJob is submitted as a run of + * a Vertex AI Experiment with this name (instead of attaching the raw + * Tensorboard resource directly). 
Multiple jobs that share the same + * value land in the same backing TensorboardExperiment, so they appear + * as comparable runs on one TensorBoard page. Requires + * GiglResourceConfig...tensorboard_resource_name to be set; that TB + * becomes the experiment's backing TB. Allowed characters: lowercase + * letters, digits, hyphens (Vertex AI Experiment ID rules). */ @SerialVersionUID(0L) final case class TrainerConfig( @@ -3973,6 +3982,7 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerArgs: _root_.scala.collection.immutable.Map[_root_.scala.Predef.String, _root_.scala.Predef.String] = _root_.scala.collection.immutable.Map.empty, executable: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty, shouldLogToTensorboard: _root_.scala.Boolean = false, + tensorboardExperimentName: _root_.scala.Predef.String = "", storageConfig: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.Empty, unknownFields: _root_.scalapb.UnknownFieldSet = _root_.scalapb.UnknownFieldSet.empty ) extends scalapb.GeneratedMessage with scalapb.lenses.Updatable[TrainerConfig] { @@ -4006,6 +4016,13 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb __size += _root_.com.google.protobuf.CodedOutputStream.computeBoolSize(12, __value) } }; + + { + val __value = tensorboardExperimentName + if (!__value.isEmpty) { + __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(14, __value) + } + }; if (storageConfig.graphStoreStorageConfig.isDefined) { val __value = storageConfig.graphStoreStorageConfig.get __size += 1 + _root_.com.google.protobuf.CodedOutputStream.computeUInt32SizeNoTag(__value.serializedSize) + __value.serializedSize @@ -4047,6 +4064,12 @@ object GbmlConfig extends 
scalapb.GeneratedMessageCompanion[snapchat.research.gb _output__.writeUInt32NoTag(__m.serializedSize) __m.writeTo(_output__) }; + { + val __v = tensorboardExperimentName + if (!__v.isEmpty) { + _output__.writeString(14, __v) + } + }; executable.clsPath.foreach { __v => val __m = __v _output__.writeString(100, __m) @@ -4067,6 +4090,7 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb def getCommand: _root_.scala.Predef.String = executable.command.getOrElse("") def withCommand(__v: _root_.scala.Predef.String): TrainerConfig = copy(executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Command(__v)) def withShouldLogToTensorboard(__v: _root_.scala.Boolean): TrainerConfig = copy(shouldLogToTensorboard = __v) + def withTensorboardExperimentName(__v: _root_.scala.Predef.String): TrainerConfig = copy(tensorboardExperimentName = __v) def getGraphStoreStorageConfig: snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig = storageConfig.graphStoreStorageConfig.getOrElse(snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig.defaultInstance) def withGraphStoreStorageConfig(__v: snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig): TrainerConfig = copy(storageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.GraphStoreStorageConfig(__v)) def clearExecutable: TrainerConfig = copy(executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty) @@ -4088,6 +4112,10 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb val __t = shouldLogToTensorboard if (__t != false) __t else null } + case 14 => { + val __t = tensorboardExperimentName + if (__t != "") __t else null + } case 13 => storageConfig.graphStoreStorageConfig.orNull } } @@ -4099,6 +4127,7 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb case 100 => 
executable.clsPath.map(_root_.scalapb.descriptors.PString(_)).getOrElse(_root_.scalapb.descriptors.PEmpty) case 101 => executable.command.map(_root_.scalapb.descriptors.PString(_)).getOrElse(_root_.scalapb.descriptors.PEmpty) case 12 => _root_.scalapb.descriptors.PBoolean(shouldLogToTensorboard) + case 14 => _root_.scalapb.descriptors.PString(tensorboardExperimentName) case 13 => storageConfig.graphStoreStorageConfig.map(_.toPMessage).getOrElse(_root_.scalapb.descriptors.PEmpty) } } @@ -4113,6 +4142,7 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb var __trainerClsPath: _root_.scala.Predef.String = "" val __trainerArgs: _root_.scala.collection.mutable.Builder[(_root_.scala.Predef.String, _root_.scala.Predef.String), _root_.scala.collection.immutable.Map[_root_.scala.Predef.String, _root_.scala.Predef.String]] = _root_.scala.collection.immutable.Map.newBuilder[_root_.scala.Predef.String, _root_.scala.Predef.String] var __shouldLogToTensorboard: _root_.scala.Boolean = false + var __tensorboardExperimentName: _root_.scala.Predef.String = "" var __executable: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty var __storageConfig: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.Empty var `_unknownFields__`: _root_.scalapb.UnknownFieldSet.Builder = null @@ -4131,6 +4161,8 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb __executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Command(_input__.readStringRequireUtf8()) case 96 => __shouldLogToTensorboard = _input__.readBool() + case 114 => + __tensorboardExperimentName = _input__.readStringRequireUtf8() case 106 => __storageConfig = 
snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.GraphStoreStorageConfig(__storageConfig.graphStoreStorageConfig.fold(_root_.scalapb.LiteParser.readMessage[snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig](_input__))(_root_.scalapb.LiteParser.readMessage(_input__, _))) case tag => @@ -4144,6 +4176,7 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerClsPath = __trainerClsPath, trainerArgs = __trainerArgs.result(), shouldLogToTensorboard = __shouldLogToTensorboard, + tensorboardExperimentName = __tensorboardExperimentName, executable = __executable, storageConfig = __storageConfig, unknownFields = if (_unknownFields__ == null) _root_.scalapb.UnknownFieldSet.empty else _unknownFields__.result() @@ -4156,6 +4189,7 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerClsPath = __fieldsMap.get(scalaDescriptor.findFieldByNumber(1).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), trainerArgs = __fieldsMap.get(scalaDescriptor.findFieldByNumber(2).get).map(_.as[_root_.scala.Seq[snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.TrainerArgsEntry]]).getOrElse(_root_.scala.Seq.empty).iterator.map(snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig._typemapper_trainerArgs.toCustom(_)).toMap, shouldLogToTensorboard = __fieldsMap.get(scalaDescriptor.findFieldByNumber(12).get).map(_.as[_root_.scala.Boolean]).getOrElse(false), + tensorboardExperimentName = __fieldsMap.get(scalaDescriptor.findFieldByNumber(14).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), executable = __fieldsMap.get(scalaDescriptor.findFieldByNumber(100).get).flatMap(_.as[_root_.scala.Option[_root_.scala.Predef.String]]).map(snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.ClsPath(_)) 
.orElse[snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable](__fieldsMap.get(scalaDescriptor.findFieldByNumber(101).get).flatMap(_.as[_root_.scala.Option[_root_.scala.Predef.String]]).map(snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Command(_))) .getOrElse(snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty), @@ -4183,6 +4217,7 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerClsPath = "", trainerArgs = _root_.scala.collection.immutable.Map.empty, shouldLogToTensorboard = false, + tensorboardExperimentName = "", executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty, storageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.Empty ) @@ -4393,6 +4428,7 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb def clsPath: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.getClsPath)((c_, f_) => c_.copy(executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.ClsPath(f_))) def command: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.getCommand)((c_, f_) => c_.copy(executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Command(f_))) def shouldLogToTensorboard: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Boolean] = field(_.shouldLogToTensorboard)((c_, f_) => c_.copy(shouldLogToTensorboard = f_)) + def tensorboardExperimentName: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.tensorboardExperimentName)((c_, f_) => c_.copy(tensorboardExperimentName = f_)) def graphStoreStorageConfig: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig] = field(_.getGraphStoreStorageConfig)((c_, f_) => c_.copy(storageConfig = 
snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.GraphStoreStorageConfig(f_))) def executable: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable] = field(_.executable)((c_, f_) => c_.copy(executable = f_)) def storageConfig: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig] = field(_.storageConfig)((c_, f_) => c_.copy(storageConfig = f_)) @@ -4402,6 +4438,7 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb final val CLS_PATH_FIELD_NUMBER = 100 final val COMMAND_FIELD_NUMBER = 101 final val SHOULD_LOG_TO_TENSORBOARD_FIELD_NUMBER = 12 + final val TENSORBOARD_EXPERIMENT_NAME_FIELD_NUMBER = 14 final val GRAPH_STORE_STORAGE_CONFIG_FIELD_NUMBER = 13 @transient private[gbml_config] val _typemapper_trainerArgs: _root_.scalapb.TypeMapper[snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.TrainerArgsEntry, (_root_.scala.Predef.String, _root_.scala.Predef.String)] = implicitly[_root_.scalapb.TypeMapper[snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.TrainerArgsEntry, (_root_.scala.Predef.String, _root_.scala.Predef.String)]] @@ -4410,12 +4447,14 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerArgs: _root_.scala.collection.immutable.Map[_root_.scala.Predef.String, _root_.scala.Predef.String], executable: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable, shouldLogToTensorboard: _root_.scala.Boolean, + tensorboardExperimentName: _root_.scala.Predef.String, storageConfig: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig ): _root_.snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig = _root_.snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig( trainerClsPath, trainerArgs, executable, shouldLogToTensorboard, + tensorboardExperimentName, storageConfig ) // 
@@protoc_insertion_point(GeneratedMessageCompanion[snapchat.research.gbml.GbmlConfig.TrainerConfig]) diff --git a/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfigProto.scala b/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfigProto.scala index a9c35d542..5aa5dd8a7 100644 --- a/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfigProto.scala +++ b/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfigProto.scala @@ -26,7 +26,7 @@ object GbmlConfigProto extends _root_.scalapb.GeneratedFileObject { GhfbWV0YWRhdGEucHJvdG8aLXNuYXBjaGF0L3Jlc2VhcmNoL2dibWwvZGF0YXNldF9tZXRhZGF0YS5wcm90bxozc25hcGNoYXQvc mVzZWFyY2gvZ2JtbC90cmFpbmVkX21vZGVsX21ldGFkYXRhLnByb3RvGi9zbmFwY2hhdC9yZXNlYXJjaC9nYm1sL2luZmVyZW5jZ V9tZXRhZGF0YS5wcm90bxozc25hcGNoYXQvcmVzZWFyY2gvZ2JtbC9wb3N0cHJvY2Vzc2VkX21ldGFkYXRhLnByb3RvGjdzbmFwY - 2hhdC9yZXNlYXJjaC9nYm1sL3N1YmdyYXBoX3NhbXBsaW5nX3N0cmF0ZWd5LnByb3RvIspMCgpHYm1sQ29uZmlnEmcKDXRhc2tfb + 2hhdC9yZXNlYXJjaC9nYm1sL3N1YmdyYXBoX3NhbXBsaW5nX3N0cmF0ZWd5LnByb3RvIqpNCgpHYm1sQ29uZmlnEmcKDXRhc2tfb WV0YWRhdGEYASABKAsyLy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkdibWxDb25maWcuVGFza01ldGFkYXRhQhHiPw4SDHRhc2tNZ XRhZGF0YVIMdGFza01ldGFkYXRhEmAKDmdyYXBoX21ldGFkYXRhGAIgASgLMiUuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5HcmFwa E1ldGFkYXRhQhLiPw8SDWdyYXBoTWV0YWRhdGFSDWdyYXBoTWV0YWRhdGESZwoNc2hhcmVkX2NvbmZpZxgDIAEoCzIvLnNuYXBja @@ -125,39 +125,40 @@ object GbmlConfigProto extends _root_.scalapb.GeneratedFileObject { BgBIAEoCUIM4j8JEgdjb21tYW5kUgdjb21tYW5kEoABCgxzdG9yYWdlX2FyZ3MYAiADKAsySy5zbmFwY2hhdC5yZXNlYXJjaC5nY m1sLkdibWxDb25maWcuR3JhcGhTdG9yZVN0b3JhZ2VDb25maWcuU3RvcmFnZUFyZ3NFbnRyeUIQ4j8NEgtzdG9yYWdlQXJnc1ILc 3RvcmFnZUFyZ3MaVAoQU3RvcmFnZUFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCC - uI/BxIFdmFsdWVSBXZhbHVlOgI4ARqDBQoNVHJhaW5lckNvbmZpZxI9ChB0cmFpbmVyX2Nsc19wYXRoGAEgASgJQhPiPxASDnRyY + uI/BxIFdmFsdWVSBXZhbHVlOgI4ARrjBQoNVHJhaW5lckNvbmZpZxI9ChB0cmFpbmVyX2Nsc19wYXRoGAEgASgJQhPiPxASDnRyY 
WluZXJDbHNQYXRoUg50cmFpbmVyQ2xzUGF0aBJ2Cgx0cmFpbmVyX2FyZ3MYAiADKAsyQS5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sL kdibWxDb25maWcuVHJhaW5lckNvbmZpZy5UcmFpbmVyQXJnc0VudHJ5QhDiPw0SC3RyYWluZXJBcmdzUgt0cmFpbmVyQXJncxIpC ghjbHNfcGF0aBhkIAEoCUIM4j8JEgdjbHNQYXRoSABSB2Nsc1BhdGgSKAoHY29tbWFuZBhlIAEoCUIM4j8JEgdjb21tYW5kSABSB 2NvbW1hbmQSVgoZc2hvdWxkX2xvZ190b190ZW5zb3Jib2FyZBgMIAEoCEIb4j8YEhZzaG91bGRMb2dUb1RlbnNvcmJvYXJkUhZza - G91bGRMb2dUb1RlbnNvcmJvYXJkEpcBChpncmFwaF9zdG9yZV9zdG9yYWdlX2NvbmZpZxgNIAEoCzI6LnNuYXBjaGF0LnJlc2Vhc - mNoLmdibWwuR2JtbENvbmZpZy5HcmFwaFN0b3JlU3RvcmFnZUNvbmZpZ0Ic4j8ZEhdncmFwaFN0b3JlU3RvcmFnZUNvbmZpZ0gBU - hdncmFwaFN0b3JlU3RvcmFnZUNvbmZpZxpUChBUcmFpbmVyQXJnc0VudHJ5EhoKA2tleRgBIAEoCUII4j8FEgNrZXlSA2tleRIgC - gV2YWx1ZRgCIAEoCUIK4j8HEgV2YWx1ZVIFdmFsdWU6AjgBQgwKCmV4ZWN1dGFibGVCEAoOc3RvcmFnZV9jb25maWcalQUKEEluZ - mVyZW5jZXJDb25maWcShQEKD2luZmVyZW5jZXJfYXJncxgBIAMoCzJHLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuR2JtbENvbmZpZ - y5JbmZlcmVuY2VyQ29uZmlnLkluZmVyZW5jZXJBcmdzRW50cnlCE+I/EBIOaW5mZXJlbmNlckFyZ3NSDmluZmVyZW5jZXJBcmdzE - kYKE2luZmVyZW5jZXJfY2xzX3BhdGgYAiABKAlCFuI/ExIRaW5mZXJlbmNlckNsc1BhdGhSEWluZmVyZW5jZXJDbHNQYXRoEikKC - GNsc19wYXRoGGQgASgJQgziPwkSB2Nsc1BhdGhIAFIHY2xzUGF0aBIoCgdjb21tYW5kGGUgASgJQgziPwkSB2NvbW1hbmRIAFIHY - 29tbWFuZBJJChRpbmZlcmVuY2VfYmF0Y2hfc2l6ZRgFIAEoDUIX4j8UEhJpbmZlcmVuY2VCYXRjaFNpemVSEmluZmVyZW5jZUJhd - GNoU2l6ZRKXAQoaZ3JhcGhfc3RvcmVfc3RvcmFnZV9jb25maWcYBiABKAsyOi5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkdibWxDb - 25maWcuR3JhcGhTdG9yZVN0b3JhZ2VDb25maWdCHOI/GRIXZ3JhcGhTdG9yZVN0b3JhZ2VDb25maWdIAVIXZ3JhcGhTdG9yZVN0b - 3JhZ2VDb25maWcaVwoTSW5mZXJlbmNlckFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABK - AlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4AUIMCgpleGVjdXRhYmxlQhAKDnN0b3JhZ2VfY29uZmlnGtsCChNQb3N0UHJvY2Vzc29yQ - 29uZmlnEpUBChNwb3N0X3Byb2Nlc3Nvcl9hcmdzGAEgAygLMk0uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5HYm1sQ29uZmlnLlBvc - 3RQcm9jZXNzb3JDb25maWcuUG9zdFByb2Nlc3NvckFyZ3NFbnRyeUIW4j8TEhFwb3N0UHJvY2Vzc29yQXJnc1IRcG9zdFByb2Nlc - 
3NvckFyZ3MSUAoXcG9zdF9wcm9jZXNzb3JfY2xzX3BhdGgYAiABKAlCGeI/FhIUcG9zdFByb2Nlc3NvckNsc1BhdGhSFHBvc3RQc - m9jZXNzb3JDbHNQYXRoGloKFlBvc3RQcm9jZXNzb3JBcmdzRW50cnkSGgoDa2V5GAEgASgJQgjiPwUSA2tleVIDa2V5EiAKBXZhb - HVlGAIgASgJQgriPwcSBXZhbHVlUgV2YWx1ZToCOAEanAIKDU1ldHJpY3NDb25maWcSPQoQbWV0cmljc19jbHNfcGF0aBgBIAEoC - UIT4j8QEg5tZXRyaWNzQ2xzUGF0aFIObWV0cmljc0Nsc1BhdGgSdgoMbWV0cmljc19hcmdzGAIgAygLMkEuc25hcGNoYXQucmVzZ - WFyY2guZ2JtbC5HYm1sQ29uZmlnLk1ldHJpY3NDb25maWcuTWV0cmljc0FyZ3NFbnRyeUIQ4j8NEgttZXRyaWNzQXJnc1ILbWV0c - mljc0FyZ3MaVAoQTWV0cmljc0FyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCCuI/B - xIFdmFsdWVSBXZhbHVlOgI4ARr0AgoOUHJvZmlsZXJDb25maWcSTwoWc2hvdWxkX2VuYWJsZV9wcm9maWxlchgBIAEoCEIZ4j8WE - hRzaG91bGRFbmFibGVQcm9maWxlclIUc2hvdWxkRW5hYmxlUHJvZmlsZXISPQoQcHJvZmlsZXJfbG9nX2RpchgCIAEoCUIT4j8QE - g5wcm9maWxlckxvZ0RpclIOcHJvZmlsZXJMb2dEaXISewoNcHJvZmlsZXJfYXJncxgDIAMoCzJDLnNuYXBjaGF0LnJlc2VhcmNoL - mdibWwuR2JtbENvbmZpZy5Qcm9maWxlckNvbmZpZy5Qcm9maWxlckFyZ3NFbnRyeUIR4j8OEgxwcm9maWxlckFyZ3NSDHByb2Zpb - GVyQXJncxpVChFQcm9maWxlckFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCCuI/B - xIFdmFsdWVSBXZhbHVlOgI4ARpVChFGZWF0dXJlRmxhZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsd - WUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4AWIGcHJvdG8z""" + G91bGRMb2dUb1RlbnNvcmJvYXJkEl4KG3RlbnNvcmJvYXJkX2V4cGVyaW1lbnRfbmFtZRgOIAEoCUIe4j8bEhl0ZW5zb3Jib2FyZ + EV4cGVyaW1lbnROYW1lUhl0ZW5zb3Jib2FyZEV4cGVyaW1lbnROYW1lEpcBChpncmFwaF9zdG9yZV9zdG9yYWdlX2NvbmZpZxgNI + AEoCzI6LnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuR2JtbENvbmZpZy5HcmFwaFN0b3JlU3RvcmFnZUNvbmZpZ0Ic4j8ZEhdncmFwa + FN0b3JlU3RvcmFnZUNvbmZpZ0gBUhdncmFwaFN0b3JlU3RvcmFnZUNvbmZpZxpUChBUcmFpbmVyQXJnc0VudHJ5EhoKA2tleRgBI + AEoCUII4j8FEgNrZXlSA2tleRIgCgV2YWx1ZRgCIAEoCUIK4j8HEgV2YWx1ZVIFdmFsdWU6AjgBQgwKCmV4ZWN1dGFibGVCEAoOc + 3RvcmFnZV9jb25maWcalQUKEEluZmVyZW5jZXJDb25maWcShQEKD2luZmVyZW5jZXJfYXJncxgBIAMoCzJHLnNuYXBjaGF0LnJlc + 
2VhcmNoLmdibWwuR2JtbENvbmZpZy5JbmZlcmVuY2VyQ29uZmlnLkluZmVyZW5jZXJBcmdzRW50cnlCE+I/EBIOaW5mZXJlbmNlc + kFyZ3NSDmluZmVyZW5jZXJBcmdzEkYKE2luZmVyZW5jZXJfY2xzX3BhdGgYAiABKAlCFuI/ExIRaW5mZXJlbmNlckNsc1BhdGhSE + WluZmVyZW5jZXJDbHNQYXRoEikKCGNsc19wYXRoGGQgASgJQgziPwkSB2Nsc1BhdGhIAFIHY2xzUGF0aBIoCgdjb21tYW5kGGUgA + SgJQgziPwkSB2NvbW1hbmRIAFIHY29tbWFuZBJJChRpbmZlcmVuY2VfYmF0Y2hfc2l6ZRgFIAEoDUIX4j8UEhJpbmZlcmVuY2VCY + XRjaFNpemVSEmluZmVyZW5jZUJhdGNoU2l6ZRKXAQoaZ3JhcGhfc3RvcmVfc3RvcmFnZV9jb25maWcYBiABKAsyOi5zbmFwY2hhd + C5yZXNlYXJjaC5nYm1sLkdibWxDb25maWcuR3JhcGhTdG9yZVN0b3JhZ2VDb25maWdCHOI/GRIXZ3JhcGhTdG9yZVN0b3JhZ2VDb + 25maWdIAVIXZ3JhcGhTdG9yZVN0b3JhZ2VDb25maWcaVwoTSW5mZXJlbmNlckFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa + 2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4AUIMCgpleGVjdXRhYmxlQhAKDnN0b3JhZ2VfY29uZ + mlnGtsCChNQb3N0UHJvY2Vzc29yQ29uZmlnEpUBChNwb3N0X3Byb2Nlc3Nvcl9hcmdzGAEgAygLMk0uc25hcGNoYXQucmVzZWFyY + 2guZ2JtbC5HYm1sQ29uZmlnLlBvc3RQcm9jZXNzb3JDb25maWcuUG9zdFByb2Nlc3NvckFyZ3NFbnRyeUIW4j8TEhFwb3N0UHJvY + 2Vzc29yQXJnc1IRcG9zdFByb2Nlc3NvckFyZ3MSUAoXcG9zdF9wcm9jZXNzb3JfY2xzX3BhdGgYAiABKAlCGeI/FhIUcG9zdFByb + 2Nlc3NvckNsc1BhdGhSFHBvc3RQcm9jZXNzb3JDbHNQYXRoGloKFlBvc3RQcm9jZXNzb3JBcmdzRW50cnkSGgoDa2V5GAEgASgJQ + gjiPwUSA2tleVIDa2V5EiAKBXZhbHVlGAIgASgJQgriPwcSBXZhbHVlUgV2YWx1ZToCOAEanAIKDU1ldHJpY3NDb25maWcSPQoQb + WV0cmljc19jbHNfcGF0aBgBIAEoCUIT4j8QEg5tZXRyaWNzQ2xzUGF0aFIObWV0cmljc0Nsc1BhdGgSdgoMbWV0cmljc19hcmdzG + AIgAygLMkEuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5HYm1sQ29uZmlnLk1ldHJpY3NDb25maWcuTWV0cmljc0FyZ3NFbnRyeUIQ4 + j8NEgttZXRyaWNzQXJnc1ILbWV0cmljc0FyZ3MaVAoQTWV0cmljc0FyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZ + XkSIAoFdmFsdWUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4ARr0AgoOUHJvZmlsZXJDb25maWcSTwoWc2hvdWxkX2VuYWJsZ + V9wcm9maWxlchgBIAEoCEIZ4j8WEhRzaG91bGRFbmFibGVQcm9maWxlclIUc2hvdWxkRW5hYmxlUHJvZmlsZXISPQoQcHJvZmlsZ + XJfbG9nX2RpchgCIAEoCUIT4j8QEg5wcm9maWxlckxvZ0RpclIOcHJvZmlsZXJMb2dEaXISewoNcHJvZmlsZXJfYXJncxgDIAMoC + 
zJDLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuR2JtbENvbmZpZy5Qcm9maWxlckNvbmZpZy5Qcm9maWxlckFyZ3NFbnRyeUIR4j8OE + gxwcm9maWxlckFyZ3NSDHByb2ZpbGVyQXJncxpVChFQcm9maWxlckFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZ + XkSIAoFdmFsdWUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4ARpVChFGZWF0dXJlRmxhZ3NFbnRyeRIaCgNrZXkYASABKAlCC + OI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4AWIGcHJvdG8z""" ).mkString) lazy val scalaDescriptor: _root_.scalapb.descriptors.FileDescriptor = { val scalaProto = com.google.protobuf.descriptor.FileDescriptorProto.parseFrom(ProtoBytes) diff --git a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala index 63c31ede6..b7f0507e5 100644 --- a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala +++ b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala @@ -3966,6 +3966,15 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb * Arguments to parameterize training process with. * @param shouldLogToTensorboard * Weather to log to tensorboard or not (defaults to false) + * @param tensorboardExperimentName + * Optional. When set, the trainer's CustomJob is submitted as a run of + * a Vertex AI Experiment with this name (instead of attaching the raw + * Tensorboard resource directly). Multiple jobs that share the same + * value land in the same backing TensorboardExperiment, so they appear + * as comparable runs on one TensorBoard page. Requires + * GiglResourceConfig...tensorboard_resource_name to be set; that TB + * becomes the experiment's backing TB. Allowed characters: lowercase + * letters, digits, hyphens (Vertex AI Experiment ID rules). 
*/ @SerialVersionUID(0L) final case class TrainerConfig( @@ -3973,6 +3982,7 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerArgs: _root_.scala.collection.immutable.Map[_root_.scala.Predef.String, _root_.scala.Predef.String] = _root_.scala.collection.immutable.Map.empty, executable: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty, shouldLogToTensorboard: _root_.scala.Boolean = false, + tensorboardExperimentName: _root_.scala.Predef.String = "", storageConfig: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.Empty, unknownFields: _root_.scalapb.UnknownFieldSet = _root_.scalapb.UnknownFieldSet.empty ) extends scalapb.GeneratedMessage with scalapb.lenses.Updatable[TrainerConfig] { @@ -4006,6 +4016,13 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb __size += _root_.com.google.protobuf.CodedOutputStream.computeBoolSize(12, __value) } }; + + { + val __value = tensorboardExperimentName + if (!__value.isEmpty) { + __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(14, __value) + } + }; if (storageConfig.graphStoreStorageConfig.isDefined) { val __value = storageConfig.graphStoreStorageConfig.get __size += 1 + _root_.com.google.protobuf.CodedOutputStream.computeUInt32SizeNoTag(__value.serializedSize) + __value.serializedSize @@ -4047,6 +4064,12 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb _output__.writeUInt32NoTag(__m.serializedSize) __m.writeTo(_output__) }; + { + val __v = tensorboardExperimentName + if (!__v.isEmpty) { + _output__.writeString(14, __v) + } + }; executable.clsPath.foreach { __v => val __m = __v _output__.writeString(100, __m) @@ -4067,6 +4090,7 @@ object GbmlConfig extends 
scalapb.GeneratedMessageCompanion[snapchat.research.gb def getCommand: _root_.scala.Predef.String = executable.command.getOrElse("") def withCommand(__v: _root_.scala.Predef.String): TrainerConfig = copy(executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Command(__v)) def withShouldLogToTensorboard(__v: _root_.scala.Boolean): TrainerConfig = copy(shouldLogToTensorboard = __v) + def withTensorboardExperimentName(__v: _root_.scala.Predef.String): TrainerConfig = copy(tensorboardExperimentName = __v) def getGraphStoreStorageConfig: snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig = storageConfig.graphStoreStorageConfig.getOrElse(snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig.defaultInstance) def withGraphStoreStorageConfig(__v: snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig): TrainerConfig = copy(storageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.GraphStoreStorageConfig(__v)) def clearExecutable: TrainerConfig = copy(executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty) @@ -4088,6 +4112,10 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb val __t = shouldLogToTensorboard if (__t != false) __t else null } + case 14 => { + val __t = tensorboardExperimentName + if (__t != "") __t else null + } case 13 => storageConfig.graphStoreStorageConfig.orNull } } @@ -4099,6 +4127,7 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb case 100 => executable.clsPath.map(_root_.scalapb.descriptors.PString(_)).getOrElse(_root_.scalapb.descriptors.PEmpty) case 101 => executable.command.map(_root_.scalapb.descriptors.PString(_)).getOrElse(_root_.scalapb.descriptors.PEmpty) case 12 => _root_.scalapb.descriptors.PBoolean(shouldLogToTensorboard) + case 14 => _root_.scalapb.descriptors.PString(tensorboardExperimentName) case 13 => 
storageConfig.graphStoreStorageConfig.map(_.toPMessage).getOrElse(_root_.scalapb.descriptors.PEmpty) } } @@ -4113,6 +4142,7 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb var __trainerClsPath: _root_.scala.Predef.String = "" val __trainerArgs: _root_.scala.collection.mutable.Builder[(_root_.scala.Predef.String, _root_.scala.Predef.String), _root_.scala.collection.immutable.Map[_root_.scala.Predef.String, _root_.scala.Predef.String]] = _root_.scala.collection.immutable.Map.newBuilder[_root_.scala.Predef.String, _root_.scala.Predef.String] var __shouldLogToTensorboard: _root_.scala.Boolean = false + var __tensorboardExperimentName: _root_.scala.Predef.String = "" var __executable: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty var __storageConfig: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.Empty var `_unknownFields__`: _root_.scalapb.UnknownFieldSet.Builder = null @@ -4131,6 +4161,8 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb __executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Command(_input__.readStringRequireUtf8()) case 96 => __shouldLogToTensorboard = _input__.readBool() + case 114 => + __tensorboardExperimentName = _input__.readStringRequireUtf8() case 106 => __storageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.GraphStoreStorageConfig(__storageConfig.graphStoreStorageConfig.fold(_root_.scalapb.LiteParser.readMessage[snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig](_input__))(_root_.scalapb.LiteParser.readMessage(_input__, _))) case tag => @@ -4144,6 +4176,7 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerClsPath = __trainerClsPath, trainerArgs = 
__trainerArgs.result(), shouldLogToTensorboard = __shouldLogToTensorboard, + tensorboardExperimentName = __tensorboardExperimentName, executable = __executable, storageConfig = __storageConfig, unknownFields = if (_unknownFields__ == null) _root_.scalapb.UnknownFieldSet.empty else _unknownFields__.result() @@ -4156,6 +4189,7 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerClsPath = __fieldsMap.get(scalaDescriptor.findFieldByNumber(1).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), trainerArgs = __fieldsMap.get(scalaDescriptor.findFieldByNumber(2).get).map(_.as[_root_.scala.Seq[snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.TrainerArgsEntry]]).getOrElse(_root_.scala.Seq.empty).iterator.map(snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig._typemapper_trainerArgs.toCustom(_)).toMap, shouldLogToTensorboard = __fieldsMap.get(scalaDescriptor.findFieldByNumber(12).get).map(_.as[_root_.scala.Boolean]).getOrElse(false), + tensorboardExperimentName = __fieldsMap.get(scalaDescriptor.findFieldByNumber(14).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), executable = __fieldsMap.get(scalaDescriptor.findFieldByNumber(100).get).flatMap(_.as[_root_.scala.Option[_root_.scala.Predef.String]]).map(snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.ClsPath(_)) .orElse[snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable](__fieldsMap.get(scalaDescriptor.findFieldByNumber(101).get).flatMap(_.as[_root_.scala.Option[_root_.scala.Predef.String]]).map(snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Command(_))) .getOrElse(snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty), @@ -4183,6 +4217,7 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerClsPath = "", trainerArgs = _root_.scala.collection.immutable.Map.empty, shouldLogToTensorboard = false, + tensorboardExperimentName = "", 
executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty, storageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.Empty ) @@ -4393,6 +4428,7 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb def clsPath: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.getClsPath)((c_, f_) => c_.copy(executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.ClsPath(f_))) def command: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.getCommand)((c_, f_) => c_.copy(executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Command(f_))) def shouldLogToTensorboard: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Boolean] = field(_.shouldLogToTensorboard)((c_, f_) => c_.copy(shouldLogToTensorboard = f_)) + def tensorboardExperimentName: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.tensorboardExperimentName)((c_, f_) => c_.copy(tensorboardExperimentName = f_)) def graphStoreStorageConfig: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig] = field(_.getGraphStoreStorageConfig)((c_, f_) => c_.copy(storageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.GraphStoreStorageConfig(f_))) def executable: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable] = field(_.executable)((c_, f_) => c_.copy(executable = f_)) def storageConfig: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig] = field(_.storageConfig)((c_, f_) => c_.copy(storageConfig = f_)) @@ -4402,6 +4438,7 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb final val CLS_PATH_FIELD_NUMBER = 100 final val COMMAND_FIELD_NUMBER = 101 final val 
SHOULD_LOG_TO_TENSORBOARD_FIELD_NUMBER = 12 + final val TENSORBOARD_EXPERIMENT_NAME_FIELD_NUMBER = 14 final val GRAPH_STORE_STORAGE_CONFIG_FIELD_NUMBER = 13 @transient private[gbml_config] val _typemapper_trainerArgs: _root_.scalapb.TypeMapper[snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.TrainerArgsEntry, (_root_.scala.Predef.String, _root_.scala.Predef.String)] = implicitly[_root_.scalapb.TypeMapper[snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.TrainerArgsEntry, (_root_.scala.Predef.String, _root_.scala.Predef.String)]] @@ -4410,12 +4447,14 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerArgs: _root_.scala.collection.immutable.Map[_root_.scala.Predef.String, _root_.scala.Predef.String], executable: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable, shouldLogToTensorboard: _root_.scala.Boolean, + tensorboardExperimentName: _root_.scala.Predef.String, storageConfig: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig ): _root_.snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig = _root_.snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig( trainerClsPath, trainerArgs, executable, shouldLogToTensorboard, + tensorboardExperimentName, storageConfig ) // @@protoc_insertion_point(GeneratedMessageCompanion[snapchat.research.gbml.GbmlConfig.TrainerConfig]) diff --git a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfigProto.scala b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfigProto.scala index a9c35d542..5aa5dd8a7 100644 --- a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfigProto.scala +++ b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfigProto.scala @@ -26,7 +26,7 @@ object GbmlConfigProto extends _root_.scalapb.GeneratedFileObject { 
GhfbWV0YWRhdGEucHJvdG8aLXNuYXBjaGF0L3Jlc2VhcmNoL2dibWwvZGF0YXNldF9tZXRhZGF0YS5wcm90bxozc25hcGNoYXQvc mVzZWFyY2gvZ2JtbC90cmFpbmVkX21vZGVsX21ldGFkYXRhLnByb3RvGi9zbmFwY2hhdC9yZXNlYXJjaC9nYm1sL2luZmVyZW5jZ V9tZXRhZGF0YS5wcm90bxozc25hcGNoYXQvcmVzZWFyY2gvZ2JtbC9wb3N0cHJvY2Vzc2VkX21ldGFkYXRhLnByb3RvGjdzbmFwY - 2hhdC9yZXNlYXJjaC9nYm1sL3N1YmdyYXBoX3NhbXBsaW5nX3N0cmF0ZWd5LnByb3RvIspMCgpHYm1sQ29uZmlnEmcKDXRhc2tfb + 2hhdC9yZXNlYXJjaC9nYm1sL3N1YmdyYXBoX3NhbXBsaW5nX3N0cmF0ZWd5LnByb3RvIqpNCgpHYm1sQ29uZmlnEmcKDXRhc2tfb WV0YWRhdGEYASABKAsyLy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkdibWxDb25maWcuVGFza01ldGFkYXRhQhHiPw4SDHRhc2tNZ XRhZGF0YVIMdGFza01ldGFkYXRhEmAKDmdyYXBoX21ldGFkYXRhGAIgASgLMiUuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5HcmFwa E1ldGFkYXRhQhLiPw8SDWdyYXBoTWV0YWRhdGFSDWdyYXBoTWV0YWRhdGESZwoNc2hhcmVkX2NvbmZpZxgDIAEoCzIvLnNuYXBja @@ -125,39 +125,40 @@ object GbmlConfigProto extends _root_.scalapb.GeneratedFileObject { BgBIAEoCUIM4j8JEgdjb21tYW5kUgdjb21tYW5kEoABCgxzdG9yYWdlX2FyZ3MYAiADKAsySy5zbmFwY2hhdC5yZXNlYXJjaC5nY m1sLkdibWxDb25maWcuR3JhcGhTdG9yZVN0b3JhZ2VDb25maWcuU3RvcmFnZUFyZ3NFbnRyeUIQ4j8NEgtzdG9yYWdlQXJnc1ILc 3RvcmFnZUFyZ3MaVAoQU3RvcmFnZUFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCC - uI/BxIFdmFsdWVSBXZhbHVlOgI4ARqDBQoNVHJhaW5lckNvbmZpZxI9ChB0cmFpbmVyX2Nsc19wYXRoGAEgASgJQhPiPxASDnRyY + uI/BxIFdmFsdWVSBXZhbHVlOgI4ARrjBQoNVHJhaW5lckNvbmZpZxI9ChB0cmFpbmVyX2Nsc19wYXRoGAEgASgJQhPiPxASDnRyY WluZXJDbHNQYXRoUg50cmFpbmVyQ2xzUGF0aBJ2Cgx0cmFpbmVyX2FyZ3MYAiADKAsyQS5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sL kdibWxDb25maWcuVHJhaW5lckNvbmZpZy5UcmFpbmVyQXJnc0VudHJ5QhDiPw0SC3RyYWluZXJBcmdzUgt0cmFpbmVyQXJncxIpC ghjbHNfcGF0aBhkIAEoCUIM4j8JEgdjbHNQYXRoSABSB2Nsc1BhdGgSKAoHY29tbWFuZBhlIAEoCUIM4j8JEgdjb21tYW5kSABSB 2NvbW1hbmQSVgoZc2hvdWxkX2xvZ190b190ZW5zb3Jib2FyZBgMIAEoCEIb4j8YEhZzaG91bGRMb2dUb1RlbnNvcmJvYXJkUhZza - G91bGRMb2dUb1RlbnNvcmJvYXJkEpcBChpncmFwaF9zdG9yZV9zdG9yYWdlX2NvbmZpZxgNIAEoCzI6LnNuYXBjaGF0LnJlc2Vhc - 
mNoLmdibWwuR2JtbENvbmZpZy5HcmFwaFN0b3JlU3RvcmFnZUNvbmZpZ0Ic4j8ZEhdncmFwaFN0b3JlU3RvcmFnZUNvbmZpZ0gBU - hdncmFwaFN0b3JlU3RvcmFnZUNvbmZpZxpUChBUcmFpbmVyQXJnc0VudHJ5EhoKA2tleRgBIAEoCUII4j8FEgNrZXlSA2tleRIgC - gV2YWx1ZRgCIAEoCUIK4j8HEgV2YWx1ZVIFdmFsdWU6AjgBQgwKCmV4ZWN1dGFibGVCEAoOc3RvcmFnZV9jb25maWcalQUKEEluZ - mVyZW5jZXJDb25maWcShQEKD2luZmVyZW5jZXJfYXJncxgBIAMoCzJHLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuR2JtbENvbmZpZ - y5JbmZlcmVuY2VyQ29uZmlnLkluZmVyZW5jZXJBcmdzRW50cnlCE+I/EBIOaW5mZXJlbmNlckFyZ3NSDmluZmVyZW5jZXJBcmdzE - kYKE2luZmVyZW5jZXJfY2xzX3BhdGgYAiABKAlCFuI/ExIRaW5mZXJlbmNlckNsc1BhdGhSEWluZmVyZW5jZXJDbHNQYXRoEikKC - GNsc19wYXRoGGQgASgJQgziPwkSB2Nsc1BhdGhIAFIHY2xzUGF0aBIoCgdjb21tYW5kGGUgASgJQgziPwkSB2NvbW1hbmRIAFIHY - 29tbWFuZBJJChRpbmZlcmVuY2VfYmF0Y2hfc2l6ZRgFIAEoDUIX4j8UEhJpbmZlcmVuY2VCYXRjaFNpemVSEmluZmVyZW5jZUJhd - GNoU2l6ZRKXAQoaZ3JhcGhfc3RvcmVfc3RvcmFnZV9jb25maWcYBiABKAsyOi5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkdibWxDb - 25maWcuR3JhcGhTdG9yZVN0b3JhZ2VDb25maWdCHOI/GRIXZ3JhcGhTdG9yZVN0b3JhZ2VDb25maWdIAVIXZ3JhcGhTdG9yZVN0b - 3JhZ2VDb25maWcaVwoTSW5mZXJlbmNlckFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABK - AlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4AUIMCgpleGVjdXRhYmxlQhAKDnN0b3JhZ2VfY29uZmlnGtsCChNQb3N0UHJvY2Vzc29yQ - 29uZmlnEpUBChNwb3N0X3Byb2Nlc3Nvcl9hcmdzGAEgAygLMk0uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5HYm1sQ29uZmlnLlBvc - 3RQcm9jZXNzb3JDb25maWcuUG9zdFByb2Nlc3NvckFyZ3NFbnRyeUIW4j8TEhFwb3N0UHJvY2Vzc29yQXJnc1IRcG9zdFByb2Nlc - 3NvckFyZ3MSUAoXcG9zdF9wcm9jZXNzb3JfY2xzX3BhdGgYAiABKAlCGeI/FhIUcG9zdFByb2Nlc3NvckNsc1BhdGhSFHBvc3RQc - m9jZXNzb3JDbHNQYXRoGloKFlBvc3RQcm9jZXNzb3JBcmdzRW50cnkSGgoDa2V5GAEgASgJQgjiPwUSA2tleVIDa2V5EiAKBXZhb - HVlGAIgASgJQgriPwcSBXZhbHVlUgV2YWx1ZToCOAEanAIKDU1ldHJpY3NDb25maWcSPQoQbWV0cmljc19jbHNfcGF0aBgBIAEoC - UIT4j8QEg5tZXRyaWNzQ2xzUGF0aFIObWV0cmljc0Nsc1BhdGgSdgoMbWV0cmljc19hcmdzGAIgAygLMkEuc25hcGNoYXQucmVzZ - WFyY2guZ2JtbC5HYm1sQ29uZmlnLk1ldHJpY3NDb25maWcuTWV0cmljc0FyZ3NFbnRyeUIQ4j8NEgttZXRyaWNzQXJnc1ILbWV0c - 
mljc0FyZ3MaVAoQTWV0cmljc0FyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCCuI/B - xIFdmFsdWVSBXZhbHVlOgI4ARr0AgoOUHJvZmlsZXJDb25maWcSTwoWc2hvdWxkX2VuYWJsZV9wcm9maWxlchgBIAEoCEIZ4j8WE - hRzaG91bGRFbmFibGVQcm9maWxlclIUc2hvdWxkRW5hYmxlUHJvZmlsZXISPQoQcHJvZmlsZXJfbG9nX2RpchgCIAEoCUIT4j8QE - g5wcm9maWxlckxvZ0RpclIOcHJvZmlsZXJMb2dEaXISewoNcHJvZmlsZXJfYXJncxgDIAMoCzJDLnNuYXBjaGF0LnJlc2VhcmNoL - mdibWwuR2JtbENvbmZpZy5Qcm9maWxlckNvbmZpZy5Qcm9maWxlckFyZ3NFbnRyeUIR4j8OEgxwcm9maWxlckFyZ3NSDHByb2Zpb - GVyQXJncxpVChFQcm9maWxlckFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCCuI/B - xIFdmFsdWVSBXZhbHVlOgI4ARpVChFGZWF0dXJlRmxhZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsd - WUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4AWIGcHJvdG8z""" + G91bGRMb2dUb1RlbnNvcmJvYXJkEl4KG3RlbnNvcmJvYXJkX2V4cGVyaW1lbnRfbmFtZRgOIAEoCUIe4j8bEhl0ZW5zb3Jib2FyZ + EV4cGVyaW1lbnROYW1lUhl0ZW5zb3Jib2FyZEV4cGVyaW1lbnROYW1lEpcBChpncmFwaF9zdG9yZV9zdG9yYWdlX2NvbmZpZxgNI + AEoCzI6LnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuR2JtbENvbmZpZy5HcmFwaFN0b3JlU3RvcmFnZUNvbmZpZ0Ic4j8ZEhdncmFwa + FN0b3JlU3RvcmFnZUNvbmZpZ0gBUhdncmFwaFN0b3JlU3RvcmFnZUNvbmZpZxpUChBUcmFpbmVyQXJnc0VudHJ5EhoKA2tleRgBI + AEoCUII4j8FEgNrZXlSA2tleRIgCgV2YWx1ZRgCIAEoCUIK4j8HEgV2YWx1ZVIFdmFsdWU6AjgBQgwKCmV4ZWN1dGFibGVCEAoOc + 3RvcmFnZV9jb25maWcalQUKEEluZmVyZW5jZXJDb25maWcShQEKD2luZmVyZW5jZXJfYXJncxgBIAMoCzJHLnNuYXBjaGF0LnJlc + 2VhcmNoLmdibWwuR2JtbENvbmZpZy5JbmZlcmVuY2VyQ29uZmlnLkluZmVyZW5jZXJBcmdzRW50cnlCE+I/EBIOaW5mZXJlbmNlc + kFyZ3NSDmluZmVyZW5jZXJBcmdzEkYKE2luZmVyZW5jZXJfY2xzX3BhdGgYAiABKAlCFuI/ExIRaW5mZXJlbmNlckNsc1BhdGhSE + WluZmVyZW5jZXJDbHNQYXRoEikKCGNsc19wYXRoGGQgASgJQgziPwkSB2Nsc1BhdGhIAFIHY2xzUGF0aBIoCgdjb21tYW5kGGUgA + SgJQgziPwkSB2NvbW1hbmRIAFIHY29tbWFuZBJJChRpbmZlcmVuY2VfYmF0Y2hfc2l6ZRgFIAEoDUIX4j8UEhJpbmZlcmVuY2VCY + XRjaFNpemVSEmluZmVyZW5jZUJhdGNoU2l6ZRKXAQoaZ3JhcGhfc3RvcmVfc3RvcmFnZV9jb25maWcYBiABKAsyOi5zbmFwY2hhd + 
C5yZXNlYXJjaC5nYm1sLkdibWxDb25maWcuR3JhcGhTdG9yZVN0b3JhZ2VDb25maWdCHOI/GRIXZ3JhcGhTdG9yZVN0b3JhZ2VDb + 25maWdIAVIXZ3JhcGhTdG9yZVN0b3JhZ2VDb25maWcaVwoTSW5mZXJlbmNlckFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa + 2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4AUIMCgpleGVjdXRhYmxlQhAKDnN0b3JhZ2VfY29uZ + mlnGtsCChNQb3N0UHJvY2Vzc29yQ29uZmlnEpUBChNwb3N0X3Byb2Nlc3Nvcl9hcmdzGAEgAygLMk0uc25hcGNoYXQucmVzZWFyY + 2guZ2JtbC5HYm1sQ29uZmlnLlBvc3RQcm9jZXNzb3JDb25maWcuUG9zdFByb2Nlc3NvckFyZ3NFbnRyeUIW4j8TEhFwb3N0UHJvY + 2Vzc29yQXJnc1IRcG9zdFByb2Nlc3NvckFyZ3MSUAoXcG9zdF9wcm9jZXNzb3JfY2xzX3BhdGgYAiABKAlCGeI/FhIUcG9zdFByb + 2Nlc3NvckNsc1BhdGhSFHBvc3RQcm9jZXNzb3JDbHNQYXRoGloKFlBvc3RQcm9jZXNzb3JBcmdzRW50cnkSGgoDa2V5GAEgASgJQ + gjiPwUSA2tleVIDa2V5EiAKBXZhbHVlGAIgASgJQgriPwcSBXZhbHVlUgV2YWx1ZToCOAEanAIKDU1ldHJpY3NDb25maWcSPQoQb + WV0cmljc19jbHNfcGF0aBgBIAEoCUIT4j8QEg5tZXRyaWNzQ2xzUGF0aFIObWV0cmljc0Nsc1BhdGgSdgoMbWV0cmljc19hcmdzG + AIgAygLMkEuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5HYm1sQ29uZmlnLk1ldHJpY3NDb25maWcuTWV0cmljc0FyZ3NFbnRyeUIQ4 + j8NEgttZXRyaWNzQXJnc1ILbWV0cmljc0FyZ3MaVAoQTWV0cmljc0FyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZ + XkSIAoFdmFsdWUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4ARr0AgoOUHJvZmlsZXJDb25maWcSTwoWc2hvdWxkX2VuYWJsZ + V9wcm9maWxlchgBIAEoCEIZ4j8WEhRzaG91bGRFbmFibGVQcm9maWxlclIUc2hvdWxkRW5hYmxlUHJvZmlsZXISPQoQcHJvZmlsZ + XJfbG9nX2RpchgCIAEoCUIT4j8QEg5wcm9maWxlckxvZ0RpclIOcHJvZmlsZXJMb2dEaXISewoNcHJvZmlsZXJfYXJncxgDIAMoC + zJDLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuR2JtbENvbmZpZy5Qcm9maWxlckNvbmZpZy5Qcm9maWxlckFyZ3NFbnRyeUIR4j8OE + gxwcm9maWxlckFyZ3NSDHByb2ZpbGVyQXJncxpVChFQcm9maWxlckFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZ + XkSIAoFdmFsdWUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4ARpVChFGZWF0dXJlRmxhZ3NFbnRyeRIaCgNrZXkYASABKAlCC + OI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4AWIGcHJvdG8z""" ).mkString) lazy val scalaDescriptor: _root_.scalapb.descriptors.FileDescriptor = { val scalaProto = 
com.google.protobuf.descriptor.FileDescriptorProto.parseFrom(ProtoBytes) diff --git a/snapchat/research/gbml/gbml_config_pb2.py b/snapchat/research/gbml/gbml_config_pb2.py index 8e5ac8019..bcce21dfb 100644 --- a/snapchat/research/gbml/gbml_config_pb2.py +++ b/snapchat/research/gbml/gbml_config_pb2.py @@ -21,7 +21,7 @@ from snapchat.research.gbml import subgraph_sampling_strategy_pb2 as snapchat_dot_research_dot_gbml_dot_subgraph__sampling__strategy__pb2 -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n(snapchat/research/gbml/gbml_config.proto\x12\x16snapchat.research.gbml\x1a)snapchat/research/gbml/graph_schema.proto\x1a\x35snapchat/research/gbml/flattened_graph_metadata.proto\x1a-snapchat/research/gbml/dataset_metadata.proto\x1a\x33snapchat/research/gbml/trained_model_metadata.proto\x1a/snapchat/research/gbml/inference_metadata.proto\x1a\x33snapchat/research/gbml/postprocessed_metadata.proto\x1a\x37snapchat/research/gbml/subgraph_sampling_strategy.proto\"\x93/\n\nGbmlConfig\x12\x46\n\rtask_metadata\x18\x01 \x01(\x0b\x32/.snapchat.research.gbml.GbmlConfig.TaskMetadata\x12=\n\x0egraph_metadata\x18\x02 \x01(\x0b\x32%.snapchat.research.gbml.GraphMetadata\x12\x46\n\rshared_config\x18\x03 \x01(\x0b\x32/.snapchat.research.gbml.GbmlConfig.SharedConfig\x12H\n\x0e\x64\x61taset_config\x18\x04 \x01(\x0b\x32\x30.snapchat.research.gbml.GbmlConfig.DatasetConfig\x12H\n\x0etrainer_config\x18\x05 \x01(\x0b\x32\x30.snapchat.research.gbml.GbmlConfig.TrainerConfig\x12N\n\x11inferencer_config\x18\x06 \x01(\x0b\x32\x33.snapchat.research.gbml.GbmlConfig.InferencerConfig\x12U\n\x15post_processor_config\x18\t \x01(\x0b\x32\x36.snapchat.research.gbml.GbmlConfig.PostProcessorConfig\x12H\n\x0emetrics_config\x18\x07 \x01(\x0b\x32\x30.snapchat.research.gbml.GbmlConfig.MetricsConfig\x12J\n\x0fprofiler_config\x18\x08 \x01(\x0b\x32\x31.snapchat.research.gbml.GbmlConfig.ProfilerConfig\x12K\n\rfeature_flags\x18\n 
\x03(\x0b\x32\x34.snapchat.research.gbml.GbmlConfig.FeatureFlagsEntry\x1a\x8f\x05\n\x0cTaskMetadata\x12i\n\x18node_based_task_metadata\x18\x01 \x01(\x0b\x32\x45.snapchat.research.gbml.GbmlConfig.TaskMetadata.NodeBasedTaskMetadataH\x00\x12\x94\x01\n/node_anchor_based_link_prediction_task_metadata\x18\x02 \x01(\x0b\x32Y.snapchat.research.gbml.GbmlConfig.TaskMetadata.NodeAnchorBasedLinkPredictionTaskMetadataH\x00\x12i\n\x18link_based_task_metadata\x18\x03 \x01(\x0b\x32\x45.snapchat.research.gbml.GbmlConfig.TaskMetadata.LinkBasedTaskMetadataH\x00\x1a\x37\n\x15NodeBasedTaskMetadata\x12\x1e\n\x16supervision_node_types\x18\x01 \x03(\t\x1am\n)NodeAnchorBasedLinkPredictionTaskMetadata\x12@\n\x16supervision_edge_types\x18\x01 \x03(\x0b\x32 .snapchat.research.gbml.EdgeType\x1aY\n\x15LinkBasedTaskMetadata\x12@\n\x16supervision_edge_types\x18\x01 \x03(\x0b\x32 .snapchat.research.gbml.EdgeTypeB\x0f\n\rtask_metadata\x1a\x96\x06\n\x0cSharedConfig\x12!\n\x19preprocessed_metadata_uri\x18\x01 \x01(\t\x12P\n\x18\x66lattened_graph_metadata\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.FlattenedGraphMetadata\x12\x41\n\x10\x64\x61taset_metadata\x18\x03 \x01(\x0b\x32\'.snapchat.research.gbml.DatasetMetadata\x12L\n\x16trained_model_metadata\x18\x04 \x01(\x0b\x32,.snapchat.research.gbml.TrainedModelMetadata\x12\x45\n\x12inference_metadata\x18\x05 \x01(\x0b\x32).snapchat.research.gbml.InferenceMetadata\x12M\n\x16postprocessed_metadata\x18\x0c \x01(\x0b\x32-.snapchat.research.gbml.PostProcessedMetadata\x12T\n\x0bshared_args\x18\x06 \x03(\x0b\x32?.snapchat.research.gbml.GbmlConfig.SharedConfig.SharedArgsEntry\x12\x19\n\x11is_graph_directed\x18\x07 \x01(\x08\x12\x1c\n\x14should_skip_training\x18\x08 \x01(\x08\x12\x30\n(should_skip_automatic_temp_asset_cleanup\x18\t \x01(\x08\x12\x1d\n\x15should_skip_inference\x18\n \x01(\x08\x12$\n\x1cshould_skip_model_evaluation\x18\x0b \x01(\x08\x12\x31\n)should_include_isolated_nodes_in_training\x18\r 
\x01(\x08\x1a\x31\n\x0fSharedArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xd3\x0c\n\rDatasetConfig\x12i\n\x18\x64\x61ta_preprocessor_config\x18\x01 \x01(\x0b\x32G.snapchat.research.gbml.GbmlConfig.DatasetConfig.DataPreprocessorConfig\x12g\n\x17subgraph_sampler_config\x18\x02 \x01(\x0b\x32\x46.snapchat.research.gbml.GbmlConfig.DatasetConfig.SubgraphSamplerConfig\x12\x65\n\x16split_generator_config\x18\x03 \x01(\x0b\x32\x45.snapchat.research.gbml.GbmlConfig.DatasetConfig.SplitGeneratorConfig\x1a\x84\x02\n\x16\x44\x61taPreprocessorConfig\x12)\n!data_preprocessor_config_cls_path\x18\x01 \x01(\t\x12\x81\x01\n\x16\x64\x61ta_preprocessor_args\x18\x02 \x03(\x0b\x32\x61.snapchat.research.gbml.GbmlConfig.DatasetConfig.DataPreprocessorConfig.DataPreprocessorArgsEntry\x1a;\n\x19\x44\x61taPreprocessorArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xd0\x04\n\x15SubgraphSamplerConfig\x12\x14\n\x08num_hops\x18\x01 \x01(\rB\x02\x18\x01\x12#\n\x17num_neighbors_to_sample\x18\x02 \x01(\x05\x42\x02\x18\x01\x12T\n\x1asubgraph_sampling_strategy\x18\n \x01(\x0b\x32\x30.snapchat.research.gbml.SubgraphSamplingStrategy\x12\x1c\n\x14num_positive_samples\x18\x03 \x01(\r\x12y\n\x12\x65xperimental_flags\x18\x05 \x03(\x0b\x32].snapchat.research.gbml.GbmlConfig.DatasetConfig.SubgraphSamplerConfig.ExperimentalFlagsEntry\x12*\n\"num_max_training_samples_to_output\x18\x06 \x01(\r\x12-\n!num_user_defined_positive_samples\x18\x07 \x01(\rB\x02\x18\x01\x12-\n!num_user_defined_negative_samples\x18\x08 \x01(\rB\x02\x18\x01\x12I\n\x0fgraph_db_config\x18\t \x01(\x0b\x32\x30.snapchat.research.gbml.GbmlConfig.GraphDBConfig\x1a\x38\n\x16\x45xperimentalFlagsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xac\x03\n\x14SplitGeneratorConfig\x12\x1f\n\x17split_strategy_cls_path\x18\x01 \x01(\t\x12y\n\x13split_strategy_args\x18\x02 
\x03(\x0b\x32\\.snapchat.research.gbml.GbmlConfig.DatasetConfig.SplitGeneratorConfig.SplitStrategyArgsEntry\x12\x19\n\x11\x61ssigner_cls_path\x18\x03 \x01(\t\x12n\n\rassigner_args\x18\x04 \x03(\x0b\x32W.snapchat.research.gbml.GbmlConfig.DatasetConfig.SplitGeneratorConfig.AssignerArgsEntry\x1a\x38\n\x16SplitStrategyArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x33\n\x11\x41ssignerArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x90\x04\n\rGraphDBConfig\x12#\n\x1bgraph_db_ingestion_cls_path\x18\x01 \x01(\t\x12k\n\x17graph_db_ingestion_args\x18\x02 \x03(\x0b\x32J.snapchat.research.gbml.GbmlConfig.GraphDBConfig.GraphDbIngestionArgsEntry\x12X\n\rgraph_db_args\x18\x03 \x03(\x0b\x32\x41.snapchat.research.gbml.GbmlConfig.GraphDBConfig.GraphDbArgsEntry\x12\x66\n\x17graph_db_sampler_config\x18\x04 \x01(\x0b\x32\x45.snapchat.research.gbml.GbmlConfig.GraphDBConfig.GraphDBServiceConfig\x1a;\n\x19GraphDbIngestionArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x32\n\x10GraphDbArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a:\n\x14GraphDBServiceConfig\x12\"\n\x1agraph_db_client_class_path\x18\x01 \x01(\t\x1a\xc1\x01\n\x17GraphStoreStorageConfig\x12\x0f\n\x07\x63ommand\x18\x01 \x01(\t\x12\x61\n\x0cstorage_args\x18\x02 \x03(\x0b\x32K.snapchat.research.gbml.GbmlConfig.GraphStoreStorageConfig.StorageArgsEntry\x1a\x32\n\x10StorageArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x82\x03\n\rTrainerConfig\x12\x18\n\x10trainer_cls_path\x18\x01 \x01(\t\x12W\n\x0ctrainer_args\x18\x02 \x03(\x0b\x32\x41.snapchat.research.gbml.GbmlConfig.TrainerConfig.TrainerArgsEntry\x12\x12\n\x08\x63ls_path\x18\x64 \x01(\tH\x00\x12\x11\n\x07\x63ommand\x18\x65 \x01(\tH\x00\x12!\n\x19should_log_to_tensorboard\x18\x0c \x01(\x08\x12`\n\x1agraph_store_storage_config\x18\r 
\x01(\x0b\x32:.snapchat.research.gbml.GbmlConfig.GraphStoreStorageConfigH\x01\x1a\x32\n\x10TrainerArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x42\x0c\n\nexecutableB\x10\n\x0estorage_config\x1a\x8f\x03\n\x10InferencerConfig\x12`\n\x0finferencer_args\x18\x01 \x03(\x0b\x32G.snapchat.research.gbml.GbmlConfig.InferencerConfig.InferencerArgsEntry\x12\x1b\n\x13inferencer_cls_path\x18\x02 \x01(\t\x12\x12\n\x08\x63ls_path\x18\x64 \x01(\tH\x00\x12\x11\n\x07\x63ommand\x18\x65 \x01(\tH\x00\x12\x1c\n\x14inference_batch_size\x18\x05 \x01(\r\x12`\n\x1agraph_store_storage_config\x18\x06 \x01(\x0b\x32:.snapchat.research.gbml.GbmlConfig.GraphStoreStorageConfigH\x01\x1a\x35\n\x13InferencerArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x42\x0c\n\nexecutableB\x10\n\x0estorage_config\x1a\xdc\x01\n\x13PostProcessorConfig\x12j\n\x13post_processor_args\x18\x01 \x03(\x0b\x32M.snapchat.research.gbml.GbmlConfig.PostProcessorConfig.PostProcessorArgsEntry\x12\x1f\n\x17post_processor_cls_path\x18\x02 \x01(\t\x1a\x38\n\x16PostProcessorArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xb6\x01\n\rMetricsConfig\x12\x18\n\x10metrics_cls_path\x18\x01 \x01(\t\x12W\n\x0cmetrics_args\x18\x02 \x03(\x0b\x32\x41.snapchat.research.gbml.GbmlConfig.MetricsConfig.MetricsArgsEntry\x1a\x32\n\x10MetricsArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xdb\x01\n\x0eProfilerConfig\x12\x1e\n\x16should_enable_profiler\x18\x01 \x01(\x08\x12\x18\n\x10profiler_log_dir\x18\x02 \x01(\t\x12Z\n\rprofiler_args\x18\x03 \x03(\x0b\x32\x43.snapchat.research.gbml.GbmlConfig.ProfilerConfig.ProfilerArgsEntry\x1a\x33\n\x11ProfilerArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x33\n\x11\x46\x65\x61tureFlagsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x62\x06proto3') +DESCRIPTOR = 
_descriptor_pool.Default().AddSerializedFile(b'\n(snapchat/research/gbml/gbml_config.proto\x12\x16snapchat.research.gbml\x1a)snapchat/research/gbml/graph_schema.proto\x1a\x35snapchat/research/gbml/flattened_graph_metadata.proto\x1a-snapchat/research/gbml/dataset_metadata.proto\x1a\x33snapchat/research/gbml/trained_model_metadata.proto\x1a/snapchat/research/gbml/inference_metadata.proto\x1a\x33snapchat/research/gbml/postprocessed_metadata.proto\x1a\x37snapchat/research/gbml/subgraph_sampling_strategy.proto\"\xb8/\n\nGbmlConfig\x12\x46\n\rtask_metadata\x18\x01 \x01(\x0b\x32/.snapchat.research.gbml.GbmlConfig.TaskMetadata\x12=\n\x0egraph_metadata\x18\x02 \x01(\x0b\x32%.snapchat.research.gbml.GraphMetadata\x12\x46\n\rshared_config\x18\x03 \x01(\x0b\x32/.snapchat.research.gbml.GbmlConfig.SharedConfig\x12H\n\x0e\x64\x61taset_config\x18\x04 \x01(\x0b\x32\x30.snapchat.research.gbml.GbmlConfig.DatasetConfig\x12H\n\x0etrainer_config\x18\x05 \x01(\x0b\x32\x30.snapchat.research.gbml.GbmlConfig.TrainerConfig\x12N\n\x11inferencer_config\x18\x06 \x01(\x0b\x32\x33.snapchat.research.gbml.GbmlConfig.InferencerConfig\x12U\n\x15post_processor_config\x18\t \x01(\x0b\x32\x36.snapchat.research.gbml.GbmlConfig.PostProcessorConfig\x12H\n\x0emetrics_config\x18\x07 \x01(\x0b\x32\x30.snapchat.research.gbml.GbmlConfig.MetricsConfig\x12J\n\x0fprofiler_config\x18\x08 \x01(\x0b\x32\x31.snapchat.research.gbml.GbmlConfig.ProfilerConfig\x12K\n\rfeature_flags\x18\n \x03(\x0b\x32\x34.snapchat.research.gbml.GbmlConfig.FeatureFlagsEntry\x1a\x8f\x05\n\x0cTaskMetadata\x12i\n\x18node_based_task_metadata\x18\x01 \x01(\x0b\x32\x45.snapchat.research.gbml.GbmlConfig.TaskMetadata.NodeBasedTaskMetadataH\x00\x12\x94\x01\n/node_anchor_based_link_prediction_task_metadata\x18\x02 \x01(\x0b\x32Y.snapchat.research.gbml.GbmlConfig.TaskMetadata.NodeAnchorBasedLinkPredictionTaskMetadataH\x00\x12i\n\x18link_based_task_metadata\x18\x03 
\x01(\x0b\x32\x45.snapchat.research.gbml.GbmlConfig.TaskMetadata.LinkBasedTaskMetadataH\x00\x1a\x37\n\x15NodeBasedTaskMetadata\x12\x1e\n\x16supervision_node_types\x18\x01 \x03(\t\x1am\n)NodeAnchorBasedLinkPredictionTaskMetadata\x12@\n\x16supervision_edge_types\x18\x01 \x03(\x0b\x32 .snapchat.research.gbml.EdgeType\x1aY\n\x15LinkBasedTaskMetadata\x12@\n\x16supervision_edge_types\x18\x01 \x03(\x0b\x32 .snapchat.research.gbml.EdgeTypeB\x0f\n\rtask_metadata\x1a\x96\x06\n\x0cSharedConfig\x12!\n\x19preprocessed_metadata_uri\x18\x01 \x01(\t\x12P\n\x18\x66lattened_graph_metadata\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.FlattenedGraphMetadata\x12\x41\n\x10\x64\x61taset_metadata\x18\x03 \x01(\x0b\x32\'.snapchat.research.gbml.DatasetMetadata\x12L\n\x16trained_model_metadata\x18\x04 \x01(\x0b\x32,.snapchat.research.gbml.TrainedModelMetadata\x12\x45\n\x12inference_metadata\x18\x05 \x01(\x0b\x32).snapchat.research.gbml.InferenceMetadata\x12M\n\x16postprocessed_metadata\x18\x0c \x01(\x0b\x32-.snapchat.research.gbml.PostProcessedMetadata\x12T\n\x0bshared_args\x18\x06 \x03(\x0b\x32?.snapchat.research.gbml.GbmlConfig.SharedConfig.SharedArgsEntry\x12\x19\n\x11is_graph_directed\x18\x07 \x01(\x08\x12\x1c\n\x14should_skip_training\x18\x08 \x01(\x08\x12\x30\n(should_skip_automatic_temp_asset_cleanup\x18\t \x01(\x08\x12\x1d\n\x15should_skip_inference\x18\n \x01(\x08\x12$\n\x1cshould_skip_model_evaluation\x18\x0b \x01(\x08\x12\x31\n)should_include_isolated_nodes_in_training\x18\r \x01(\x08\x1a\x31\n\x0fSharedArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xd3\x0c\n\rDatasetConfig\x12i\n\x18\x64\x61ta_preprocessor_config\x18\x01 \x01(\x0b\x32G.snapchat.research.gbml.GbmlConfig.DatasetConfig.DataPreprocessorConfig\x12g\n\x17subgraph_sampler_config\x18\x02 \x01(\x0b\x32\x46.snapchat.research.gbml.GbmlConfig.DatasetConfig.SubgraphSamplerConfig\x12\x65\n\x16split_generator_config\x18\x03 
\x01(\x0b\x32\x45.snapchat.research.gbml.GbmlConfig.DatasetConfig.SplitGeneratorConfig\x1a\x84\x02\n\x16\x44\x61taPreprocessorConfig\x12)\n!data_preprocessor_config_cls_path\x18\x01 \x01(\t\x12\x81\x01\n\x16\x64\x61ta_preprocessor_args\x18\x02 \x03(\x0b\x32\x61.snapchat.research.gbml.GbmlConfig.DatasetConfig.DataPreprocessorConfig.DataPreprocessorArgsEntry\x1a;\n\x19\x44\x61taPreprocessorArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xd0\x04\n\x15SubgraphSamplerConfig\x12\x14\n\x08num_hops\x18\x01 \x01(\rB\x02\x18\x01\x12#\n\x17num_neighbors_to_sample\x18\x02 \x01(\x05\x42\x02\x18\x01\x12T\n\x1asubgraph_sampling_strategy\x18\n \x01(\x0b\x32\x30.snapchat.research.gbml.SubgraphSamplingStrategy\x12\x1c\n\x14num_positive_samples\x18\x03 \x01(\r\x12y\n\x12\x65xperimental_flags\x18\x05 \x03(\x0b\x32].snapchat.research.gbml.GbmlConfig.DatasetConfig.SubgraphSamplerConfig.ExperimentalFlagsEntry\x12*\n\"num_max_training_samples_to_output\x18\x06 \x01(\r\x12-\n!num_user_defined_positive_samples\x18\x07 \x01(\rB\x02\x18\x01\x12-\n!num_user_defined_negative_samples\x18\x08 \x01(\rB\x02\x18\x01\x12I\n\x0fgraph_db_config\x18\t \x01(\x0b\x32\x30.snapchat.research.gbml.GbmlConfig.GraphDBConfig\x1a\x38\n\x16\x45xperimentalFlagsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xac\x03\n\x14SplitGeneratorConfig\x12\x1f\n\x17split_strategy_cls_path\x18\x01 \x01(\t\x12y\n\x13split_strategy_args\x18\x02 \x03(\x0b\x32\\.snapchat.research.gbml.GbmlConfig.DatasetConfig.SplitGeneratorConfig.SplitStrategyArgsEntry\x12\x19\n\x11\x61ssigner_cls_path\x18\x03 \x01(\t\x12n\n\rassigner_args\x18\x04 \x03(\x0b\x32W.snapchat.research.gbml.GbmlConfig.DatasetConfig.SplitGeneratorConfig.AssignerArgsEntry\x1a\x38\n\x16SplitStrategyArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x33\n\x11\x41ssignerArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 
\x01(\t:\x02\x38\x01\x1a\x90\x04\n\rGraphDBConfig\x12#\n\x1bgraph_db_ingestion_cls_path\x18\x01 \x01(\t\x12k\n\x17graph_db_ingestion_args\x18\x02 \x03(\x0b\x32J.snapchat.research.gbml.GbmlConfig.GraphDBConfig.GraphDbIngestionArgsEntry\x12X\n\rgraph_db_args\x18\x03 \x03(\x0b\x32\x41.snapchat.research.gbml.GbmlConfig.GraphDBConfig.GraphDbArgsEntry\x12\x66\n\x17graph_db_sampler_config\x18\x04 \x01(\x0b\x32\x45.snapchat.research.gbml.GbmlConfig.GraphDBConfig.GraphDBServiceConfig\x1a;\n\x19GraphDbIngestionArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x32\n\x10GraphDbArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a:\n\x14GraphDBServiceConfig\x12\"\n\x1agraph_db_client_class_path\x18\x01 \x01(\t\x1a\xc1\x01\n\x17GraphStoreStorageConfig\x12\x0f\n\x07\x63ommand\x18\x01 \x01(\t\x12\x61\n\x0cstorage_args\x18\x02 \x03(\x0b\x32K.snapchat.research.gbml.GbmlConfig.GraphStoreStorageConfig.StorageArgsEntry\x1a\x32\n\x10StorageArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xa7\x03\n\rTrainerConfig\x12\x18\n\x10trainer_cls_path\x18\x01 \x01(\t\x12W\n\x0ctrainer_args\x18\x02 \x03(\x0b\x32\x41.snapchat.research.gbml.GbmlConfig.TrainerConfig.TrainerArgsEntry\x12\x12\n\x08\x63ls_path\x18\x64 \x01(\tH\x00\x12\x11\n\x07\x63ommand\x18\x65 \x01(\tH\x00\x12!\n\x19should_log_to_tensorboard\x18\x0c \x01(\x08\x12#\n\x1btensorboard_experiment_name\x18\x0e \x01(\t\x12`\n\x1agraph_store_storage_config\x18\r \x01(\x0b\x32:.snapchat.research.gbml.GbmlConfig.GraphStoreStorageConfigH\x01\x1a\x32\n\x10TrainerArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x42\x0c\n\nexecutableB\x10\n\x0estorage_config\x1a\x8f\x03\n\x10InferencerConfig\x12`\n\x0finferencer_args\x18\x01 \x03(\x0b\x32G.snapchat.research.gbml.GbmlConfig.InferencerConfig.InferencerArgsEntry\x12\x1b\n\x13inferencer_cls_path\x18\x02 
\x01(\t\x12\x12\n\x08\x63ls_path\x18\x64 \x01(\tH\x00\x12\x11\n\x07\x63ommand\x18\x65 \x01(\tH\x00\x12\x1c\n\x14inference_batch_size\x18\x05 \x01(\r\x12`\n\x1agraph_store_storage_config\x18\x06 \x01(\x0b\x32:.snapchat.research.gbml.GbmlConfig.GraphStoreStorageConfigH\x01\x1a\x35\n\x13InferencerArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x42\x0c\n\nexecutableB\x10\n\x0estorage_config\x1a\xdc\x01\n\x13PostProcessorConfig\x12j\n\x13post_processor_args\x18\x01 \x03(\x0b\x32M.snapchat.research.gbml.GbmlConfig.PostProcessorConfig.PostProcessorArgsEntry\x12\x1f\n\x17post_processor_cls_path\x18\x02 \x01(\t\x1a\x38\n\x16PostProcessorArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xb6\x01\n\rMetricsConfig\x12\x18\n\x10metrics_cls_path\x18\x01 \x01(\t\x12W\n\x0cmetrics_args\x18\x02 \x03(\x0b\x32\x41.snapchat.research.gbml.GbmlConfig.MetricsConfig.MetricsArgsEntry\x1a\x32\n\x10MetricsArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xdb\x01\n\x0eProfilerConfig\x12\x1e\n\x16should_enable_profiler\x18\x01 \x01(\x08\x12\x18\n\x10profiler_log_dir\x18\x02 \x01(\t\x12Z\n\rprofiler_args\x18\x03 \x03(\x0b\x32\x43.snapchat.research.gbml.GbmlConfig.ProfilerConfig.ProfilerArgsEntry\x1a\x33\n\x11ProfilerArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x33\n\x11\x46\x65\x61tureFlagsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x62\x06proto3') @@ -352,7 +352,7 @@ _GBMLCONFIG_FEATUREFLAGSENTRY._options = None _GBMLCONFIG_FEATUREFLAGSENTRY._serialized_options = b'8\001' _GBMLCONFIG._serialized_start=426 - _GBMLCONFIG._serialized_end=6461 + _GBMLCONFIG._serialized_end=6498 _GBMLCONFIG_TASKMETADATA._serialized_start=1190 _GBMLCONFIG_TASKMETADATA._serialized_end=1845 _GBMLCONFIG_TASKMETADATA_NODEBASEDTASKMETADATA._serialized_start=1571 @@ -394,25 +394,25 @@ 
_GBMLCONFIG_GRAPHSTORESTORAGECONFIG_STORAGEARGSENTRY._serialized_start=4937 _GBMLCONFIG_GRAPHSTORESTORAGECONFIG_STORAGEARGSENTRY._serialized_end=4987 _GBMLCONFIG_TRAINERCONFIG._serialized_start=4990 - _GBMLCONFIG_TRAINERCONFIG._serialized_end=5376 - _GBMLCONFIG_TRAINERCONFIG_TRAINERARGSENTRY._serialized_start=5294 - _GBMLCONFIG_TRAINERCONFIG_TRAINERARGSENTRY._serialized_end=5344 - _GBMLCONFIG_INFERENCERCONFIG._serialized_start=5379 - _GBMLCONFIG_INFERENCERCONFIG._serialized_end=5778 - _GBMLCONFIG_INFERENCERCONFIG_INFERENCERARGSENTRY._serialized_start=5693 - _GBMLCONFIG_INFERENCERCONFIG_INFERENCERARGSENTRY._serialized_end=5746 - _GBMLCONFIG_POSTPROCESSORCONFIG._serialized_start=5781 - _GBMLCONFIG_POSTPROCESSORCONFIG._serialized_end=6001 - _GBMLCONFIG_POSTPROCESSORCONFIG_POSTPROCESSORARGSENTRY._serialized_start=5945 - _GBMLCONFIG_POSTPROCESSORCONFIG_POSTPROCESSORARGSENTRY._serialized_end=6001 - _GBMLCONFIG_METRICSCONFIG._serialized_start=6004 - _GBMLCONFIG_METRICSCONFIG._serialized_end=6186 - _GBMLCONFIG_METRICSCONFIG_METRICSARGSENTRY._serialized_start=6136 - _GBMLCONFIG_METRICSCONFIG_METRICSARGSENTRY._serialized_end=6186 - _GBMLCONFIG_PROFILERCONFIG._serialized_start=6189 - _GBMLCONFIG_PROFILERCONFIG._serialized_end=6408 - _GBMLCONFIG_PROFILERCONFIG_PROFILERARGSENTRY._serialized_start=6357 - _GBMLCONFIG_PROFILERCONFIG_PROFILERARGSENTRY._serialized_end=6408 - _GBMLCONFIG_FEATUREFLAGSENTRY._serialized_start=6410 - _GBMLCONFIG_FEATUREFLAGSENTRY._serialized_end=6461 + _GBMLCONFIG_TRAINERCONFIG._serialized_end=5413 + _GBMLCONFIG_TRAINERCONFIG_TRAINERARGSENTRY._serialized_start=5331 + _GBMLCONFIG_TRAINERCONFIG_TRAINERARGSENTRY._serialized_end=5381 + _GBMLCONFIG_INFERENCERCONFIG._serialized_start=5416 + _GBMLCONFIG_INFERENCERCONFIG._serialized_end=5815 + _GBMLCONFIG_INFERENCERCONFIG_INFERENCERARGSENTRY._serialized_start=5730 + _GBMLCONFIG_INFERENCERCONFIG_INFERENCERARGSENTRY._serialized_end=5783 + _GBMLCONFIG_POSTPROCESSORCONFIG._serialized_start=5818 + 
_GBMLCONFIG_POSTPROCESSORCONFIG._serialized_end=6038 + _GBMLCONFIG_POSTPROCESSORCONFIG_POSTPROCESSORARGSENTRY._serialized_start=5982 + _GBMLCONFIG_POSTPROCESSORCONFIG_POSTPROCESSORARGSENTRY._serialized_end=6038 + _GBMLCONFIG_METRICSCONFIG._serialized_start=6041 + _GBMLCONFIG_METRICSCONFIG._serialized_end=6223 + _GBMLCONFIG_METRICSCONFIG_METRICSARGSENTRY._serialized_start=6173 + _GBMLCONFIG_METRICSCONFIG_METRICSARGSENTRY._serialized_end=6223 + _GBMLCONFIG_PROFILERCONFIG._serialized_start=6226 + _GBMLCONFIG_PROFILERCONFIG._serialized_end=6445 + _GBMLCONFIG_PROFILERCONFIG_PROFILERARGSENTRY._serialized_start=6394 + _GBMLCONFIG_PROFILERCONFIG_PROFILERARGSENTRY._serialized_end=6445 + _GBMLCONFIG_FEATUREFLAGSENTRY._serialized_start=6447 + _GBMLCONFIG_FEATUREFLAGSENTRY._serialized_end=6498 # @@protoc_insertion_point(module_scope) diff --git a/snapchat/research/gbml/gbml_config_pb2.pyi b/snapchat/research/gbml/gbml_config_pb2.pyi index 98d4ee693..914aaa202 100644 --- a/snapchat/research/gbml/gbml_config_pb2.pyi +++ b/snapchat/research/gbml/gbml_config_pb2.pyi @@ -542,6 +542,7 @@ class GbmlConfig(google.protobuf.message.Message): CLS_PATH_FIELD_NUMBER: builtins.int COMMAND_FIELD_NUMBER: builtins.int SHOULD_LOG_TO_TENSORBOARD_FIELD_NUMBER: builtins.int + TENSORBOARD_EXPERIMENT_NAME_FIELD_NUMBER: builtins.int GRAPH_STORE_STORAGE_CONFIG_FIELD_NUMBER: builtins.int trainer_cls_path: builtins.str """(deprecated) @@ -556,6 +557,16 @@ class GbmlConfig(google.protobuf.message.Message): """Command to use for launching trainer job""" should_log_to_tensorboard: builtins.bool """Weather to log to tensorboard or not (defaults to false)""" + tensorboard_experiment_name: builtins.str + """Optional. When set, the trainer's CustomJob is submitted as a run of + a Vertex AI Experiment with this name (instead of attaching the raw + Tensorboard resource directly). 
Multiple jobs that share the same + value land in the same backing TensorboardExperiment, so they appear + as comparable runs on one TensorBoard page. Requires + GiglResourceConfig...tensorboard_resource_name to be set; that TB + becomes the experiment's backing TB. Allowed characters: lowercase + letters, digits, hyphens (Vertex AI Experiment ID rules). + """ @property def graph_store_storage_config(self) -> global___GbmlConfig.GraphStoreStorageConfig: ... def __init__( @@ -566,10 +577,11 @@ class GbmlConfig(google.protobuf.message.Message): cls_path: builtins.str = ..., command: builtins.str = ..., should_log_to_tensorboard: builtins.bool = ..., + tensorboard_experiment_name: builtins.str = ..., graph_store_storage_config: global___GbmlConfig.GraphStoreStorageConfig | None = ..., ) -> None: ... def HasField(self, field_name: typing_extensions.Literal["cls_path", b"cls_path", "command", b"command", "executable", b"executable", "graph_store_storage_config", b"graph_store_storage_config", "storage_config", b"storage_config"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["cls_path", b"cls_path", "command", b"command", "executable", b"executable", "graph_store_storage_config", b"graph_store_storage_config", "should_log_to_tensorboard", b"should_log_to_tensorboard", "storage_config", b"storage_config", "trainer_args", b"trainer_args", "trainer_cls_path", b"trainer_cls_path"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["cls_path", b"cls_path", "command", b"command", "executable", b"executable", "graph_store_storage_config", b"graph_store_storage_config", "should_log_to_tensorboard", b"should_log_to_tensorboard", "storage_config", b"storage_config", "tensorboard_experiment_name", b"tensorboard_experiment_name", "trainer_args", b"trainer_args", "trainer_cls_path", b"trainer_cls_path"]) -> None: ... 
@typing.overload def WhichOneof(self, oneof_group: typing_extensions.Literal["executable", b"executable"]) -> typing_extensions.Literal["cls_path", "command"] | None: ... @typing.overload From cd40efd103cfcc1d9866394bf9ea669756b8324e Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Mon, 4 May 2026 16:58:34 +0000 Subject: [PATCH 21/59] validation: tensorboard_experiment_name requires tensorboard_resource_name Co-Authored-By: Claude Sonnet 4.6 --- ...nd_resource_config_compatibility_checks.py | 22 ++++++++ ...source_config_compatibility_checks_test.py | 56 +++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py index 9e8d854a3..9920196d3 100644 --- a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py +++ b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py @@ -120,6 +120,28 @@ def check_vertex_ai_trainer_tensorboard_compatibility( "Config validation check: Vertex AI trainer TensorBoard compatibility between template and resource configs." ) + experiment_name = gbml_config_pb_wrapper.trainer_config.tensorboard_experiment_name + if experiment_name: + trainer_resource_config = resource_config_wrapper.trainer_config + if isinstance( + trainer_resource_config, gigl_resource_config_pb2.VertexAiResourceConfig + ): + tb_resource = trainer_resource_config.tensorboard_resource_name + elif isinstance( + trainer_resource_config, gigl_resource_config_pb2.VertexAiGraphStoreConfig + ): + tb_resource = ( + trainer_resource_config.compute_pool.tensorboard_resource_name + ) + else: + tb_resource = "" + assert tb_resource, ( + "GbmlConfig.trainer_config.tensorboard_experiment_name is set " + f"({experiment_name!r}) but no Vertex AI TensorBoard resource is " + "configured on the trainer resource config; the experiment needs a " + "backing TB resource." 
+ ) + if not gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard: return diff --git a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py index 734074987..a09fd2b86 100644 --- a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py +++ b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py @@ -102,6 +102,15 @@ def _create_gbml_config_with_tensorboard_enabled() -> GbmlConfigPbWrapper: return GbmlConfigPbWrapper(gbml_config_pb=gbml_config) +def _create_gbml_config_with_tensorboard_experiment_name( + experiment_name: str = "my-comparison", +) -> GbmlConfigPbWrapper: + """Create a GbmlConfig with trainer tensorboard_experiment_name set.""" + gbml_config = gbml_config_pb2.GbmlConfig() + gbml_config.trainer_config.tensorboard_experiment_name = experiment_name + return GbmlConfigPbWrapper(gbml_config_pb=gbml_config) + + def _create_resource_config_with_both_graph_stores() -> GiglResourceConfigWrapper: """Create a GiglResourceConfig with VertexAiGraphStoreConfig for both trainer and inferencer.""" config = gigl_resource_config_pb2.GiglResourceConfig() @@ -289,6 +298,53 @@ def test_resource_has_inferencer_graph_store_template_does_not(self): resource_config_wrapper=resource_config, ) + def test_experiment_name_set_without_tensorboard_resource_raises(self): + """tensorboard_experiment_name set but no TB resource → AssertionError mentioning the field.""" + gbml_config = _create_gbml_config_with_tensorboard_experiment_name( + experiment_name="my-comparison" + ) + resource_config = _create_resource_config_without_graph_stores() + + with self.assertRaises(AssertionError) as ctx: + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + self.assertIn("tensorboard_experiment_name", str(ctx.exception)) + + def 
test_experiment_name_set_with_tensorboard_resource_does_not_raise(self): + """tensorboard_experiment_name set and TB resource present → no exception.""" + gbml_config = _create_gbml_config_with_tensorboard_experiment_name( + experiment_name="my-comparison" + ) + resource_config = _create_resource_config_with_trainer_tensorboard( + tensorboard_resource_name=( + "projects/test-project/locations/us-central1/tensorboards/test" + ) + ) + + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + + def test_experiment_name_set_with_graph_store_tensorboard_resource_does_not_raise(self): + """tensorboard_experiment_name set and graph-store TB resource present → no exception.""" + gbml_config = _create_gbml_config_with_tensorboard_experiment_name( + experiment_name="my-comparison" + ) + resource_config = _create_resource_config_with_trainer_tensorboard( + tensorboard_resource_name=( + "projects/test-project/locations/us-central1/tensorboards/test" + ), + use_graph_store=True, + ) + + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + if __name__ == "__main__": absltest.main() From 2a78d5a9fe154a0478258ac4a240bf0a10cff602 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Mon, 4 May 2026 17:05:46 +0000 Subject: [PATCH 22/59] vertex_ai: add tensorboard_experiment_name to VertexAiJobConfig Co-Authored-By: Claude Sonnet 4.6 --- gigl/common/services/vertex_ai.py | 6 ++++++ tests/unit/src/common/vertex_ai_test.py | 11 +++++++++++ 2 files changed, 17 insertions(+) diff --git a/gigl/common/services/vertex_ai.py b/gigl/common/services/vertex_ai.py index 4b1582aeb..d191ad6e9 100644 --- a/gigl/common/services/vertex_ai.py +++ b/gigl/common/services/vertex_ai.py @@ -140,6 +140,11 @@ class VertexAiJobConfig: ``AIP_TENSORBOARD_LOG_DIR`` from this directory. 
tensorboard_resource_name: Optional existing Vertex AI TensorBoard resource to attach to the job. + tensorboard_experiment_name: Optional Vertex AI Experiment name. When + set, the job is submitted with ``experiment=`` (mutually + exclusive with ``tensorboard_resource_name`` on submit; see + ``_submit_job``). Multiple jobs sharing this name appear as + comparable runs on a single TensorBoard page. """ job_name: str @@ -160,6 +165,7 @@ class VertexAiJobConfig: reservation_affinity: Optional[ReservationAffinity] = None base_output_dir: Optional[str] = None tensorboard_resource_name: Optional[str] = None + tensorboard_experiment_name: Optional[str] = None class VertexAIService: diff --git a/tests/unit/src/common/vertex_ai_test.py b/tests/unit/src/common/vertex_ai_test.py index 2ffbc8c26..457d57081 100644 --- a/tests/unit/src/common/vertex_ai_test.py +++ b/tests/unit/src/common/vertex_ai_test.py @@ -61,5 +61,16 @@ def test_submit_job_passes_tensorboard_and_base_output_dir( ) + def test_vertex_ai_job_config_carries_experiment_name(self) -> None: + cfg = VertexAiJobConfig( + job_name="job", + container_uri="gcr.io/p/img:tag", + command=["python", "-m", "x"], + tensorboard_resource_name="projects/p/locations/us/tensorboards/1", + tensorboard_experiment_name="my-comparison", + ) + self.assertEqual(cfg.tensorboard_experiment_name, "my-comparison") + + if __name__ == "__main__": absltest.main() From b6b70a77d6f4f2dae353780e36f2c6721a3a0c39 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Mon, 4 May 2026 17:12:10 +0000 Subject: [PATCH 23/59] vertex_ai: helper to ensure Experiment exists with backing TB Co-Authored-By: Claude Sonnet 4.6 --- gigl/common/services/vertex_ai.py | 41 +++++++++ tests/unit/src/common/vertex_ai_test.py | 109 ++++++++++++++++++++++++ 2 files changed, 150 insertions(+) diff --git a/gigl/common/services/vertex_ai.py b/gigl/common/services/vertex_ai.py index d191ad6e9..22b7d2b18 100644 --- a/gigl/common/services/vertex_ai.py +++ 
b/gigl/common/services/vertex_ai.py @@ -347,6 +347,47 @@ def launch_graph_store_job( return self._submit_job(worker_pool_specs, compute_pool_job_config) + def _ensure_experiment_with_backing_tb( + self, + experiment_name: str, + tensorboard_resource_name: str, + ) -> None: + """Ensure ``experiment_name`` exists with ``tensorboard_resource_name`` as its backing TB. + + Idempotent. Creates the Vertex AI Experiment if missing and assigns the + backing TB. If the experiment already exists with a different backing + TB, raises ``ValueError`` (silently uploading to the wrong TB would be + surprising and hard to debug). + + Args: + experiment_name: The name of the Vertex AI Experiment. + tensorboard_resource_name: The fully-qualified resource name of the + Vertex AI Tensorboard to use as the backing TB. + + Raises: + ValueError: If the experiment already exists with a different + backing tensorboard resource name. + """ + experiment = aiplatform.Experiment.get(experiment_name) + if experiment is None: + experiment = aiplatform.Experiment.create(experiment_name) + experiment.assign_backing_tensorboard(tensorboard_resource_name) + return + + backing = experiment.get_backing_tensorboard_resource() + if backing is None: + experiment.assign_backing_tensorboard(tensorboard_resource_name) + return + + if backing.resource_name != tensorboard_resource_name: + raise ValueError( + f"Vertex AI Experiment {experiment_name!r} already has a " + f"backing tensorboard {backing.resource_name!r} that does not " + f"match the configured {tensorboard_resource_name!r}. Either " + "use a fresh experiment name or update the resource config to " + "the existing backing TB." 
+ ) + def _submit_job( self, worker_pool_specs: Union[list[WorkerPoolSpec], list[dict]], diff --git a/tests/unit/src/common/vertex_ai_test.py b/tests/unit/src/common/vertex_ai_test.py index 457d57081..7037352cd 100644 --- a/tests/unit/src/common/vertex_ai_test.py +++ b/tests/unit/src/common/vertex_ai_test.py @@ -72,5 +72,114 @@ def test_vertex_ai_job_config_carries_experiment_name(self) -> None: self.assertEqual(cfg.tensorboard_experiment_name, "my-comparison") +class TestEnsureExperimentWithBackingTb(TestCase): + """Tests for VertexAIService._ensure_experiment_with_backing_tb.""" + + _TB_RESOURCE_NAME = "projects/p/locations/us-central1/tensorboards/42" + _EXPERIMENT_NAME = "my-experiment" + + def _make_service(self, mock_init: Mock) -> VertexAIService: + return VertexAIService( + project="test-project", + location="us-central1", + service_account="svc@test.iam.gserviceaccount.com", + staging_bucket="gs://test-bucket", + ) + + @patch("gigl.common.services.vertex_ai.aiplatform.Experiment") + @patch("gigl.common.services.vertex_ai.aiplatform.init") + def test_experiment_does_not_exist_creates_and_assigns( + self, + mock_init: Mock, + mock_experiment_class: Mock, + ) -> None: + """When the experiment doesn't exist, creates it and assigns backing TB.""" + mock_experiment_class.get.return_value = None + mock_new_experiment = Mock() + mock_experiment_class.create.return_value = mock_new_experiment + + service = self._make_service(mock_init) + service._ensure_experiment_with_backing_tb( + self._EXPERIMENT_NAME, self._TB_RESOURCE_NAME + ) + + mock_experiment_class.get.assert_called_once_with(self._EXPERIMENT_NAME) + mock_experiment_class.create.assert_called_once_with(self._EXPERIMENT_NAME) + mock_new_experiment.assign_backing_tensorboard.assert_called_once_with( + self._TB_RESOURCE_NAME + ) + + @patch("gigl.common.services.vertex_ai.aiplatform.Experiment") + @patch("gigl.common.services.vertex_ai.aiplatform.init") + def test_experiment_exists_no_backing_tb_assigns( + 
self, + mock_init: Mock, + mock_experiment_class: Mock, + ) -> None: + """When the experiment exists with no backing TB, assigns the backing TB.""" + mock_existing_experiment = Mock() + mock_existing_experiment.get_backing_tensorboard_resource.return_value = None + mock_experiment_class.get.return_value = mock_existing_experiment + + service = self._make_service(mock_init) + service._ensure_experiment_with_backing_tb( + self._EXPERIMENT_NAME, self._TB_RESOURCE_NAME + ) + + mock_experiment_class.create.assert_not_called() + mock_existing_experiment.assign_backing_tensorboard.assert_called_once_with( + self._TB_RESOURCE_NAME + ) + + @patch("gigl.common.services.vertex_ai.aiplatform.Experiment") + @patch("gigl.common.services.vertex_ai.aiplatform.init") + def test_experiment_exists_different_backing_tb_raises( + self, + mock_init: Mock, + mock_experiment_class: Mock, + ) -> None: + """When the experiment exists with a different backing TB, raises ValueError.""" + mock_backing = Mock() + mock_backing.resource_name = "projects/p/locations/us-central1/tensorboards/99" + mock_existing_experiment = Mock() + mock_existing_experiment.get_backing_tensorboard_resource.return_value = ( + mock_backing + ) + mock_experiment_class.get.return_value = mock_existing_experiment + + service = self._make_service(mock_init) + with self.assertRaises(ValueError) as ctx: + service._ensure_experiment_with_backing_tb( + self._EXPERIMENT_NAME, self._TB_RESOURCE_NAME + ) + + self.assertIn("backing tensorboard", str(ctx.exception).lower()) + + @patch("gigl.common.services.vertex_ai.aiplatform.Experiment") + @patch("gigl.common.services.vertex_ai.aiplatform.init") + def test_experiment_exists_matching_backing_tb_is_noop( + self, + mock_init: Mock, + mock_experiment_class: Mock, + ) -> None: + """When the experiment exists with the correct backing TB, does nothing.""" + mock_backing = Mock() + mock_backing.resource_name = self._TB_RESOURCE_NAME + mock_existing_experiment = Mock() + 
mock_existing_experiment.get_backing_tensorboard_resource.return_value = ( + mock_backing + ) + mock_experiment_class.get.return_value = mock_existing_experiment + + service = self._make_service(mock_init) + # Should not raise and should not call assign or create + service._ensure_experiment_with_backing_tb( + self._EXPERIMENT_NAME, self._TB_RESOURCE_NAME + ) + + mock_experiment_class.create.assert_not_called() + mock_existing_experiment.assign_backing_tensorboard.assert_not_called() + + if __name__ == "__main__": absltest.main() From 4868e04e540f604d8b14c0a26be6348c662662c2 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Mon, 4 May 2026 17:23:31 +0000 Subject: [PATCH 24/59] vertex_ai: submit with experiment when tensorboard_experiment_name is set Co-Authored-By: Claude Sonnet 4.6 --- gigl/common/services/vertex_ai.py | 20 ++++++- tests/unit/src/common/vertex_ai_test.py | 78 +++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 3 deletions(-) diff --git a/gigl/common/services/vertex_ai.py b/gigl/common/services/vertex_ai.py index 22b7d2b18..bc8ed47e2 100644 --- a/gigl/common/services/vertex_ai.py +++ b/gigl/common/services/vertex_ai.py @@ -62,7 +62,7 @@ def get_pipeline() -> int: # NOTE: `get_pipeline` here is the Pipeline name import datetime import time from dataclasses import dataclass -from typing import Final, Optional, Union +from typing import Any, Final, Optional, Union from google.cloud import aiplatform from google.cloud.aiplatform_v1.types import ( @@ -403,13 +403,27 @@ def _submit_job( staging_bucket=self._staging_bucket, base_output_dir=job_config.base_output_dir, ) - job.submit( + submit_kwargs: dict[str, Any] = dict( service_account=self._service_account, timeout=job_config.timeout_s, enable_web_access=job_config.enable_web_access, scheduling_strategy=job_config.scheduling_strategy, - tensorboard=job_config.tensorboard_resource_name, ) + if job_config.tensorboard_experiment_name: + if not job_config.tensorboard_resource_name: + raise 
ValueError( + "tensorboard_experiment_name is set but tensorboard_resource_name " + "is not; the experiment needs a backing TB resource." + ) + self._ensure_experiment_with_backing_tb( + experiment_name=job_config.tensorboard_experiment_name, + tensorboard_resource_name=job_config.tensorboard_resource_name, + ) + submit_kwargs["experiment"] = job_config.tensorboard_experiment_name + submit_kwargs["experiment_run"] = job_config.job_name + else: + submit_kwargs["tensorboard"] = job_config.tensorboard_resource_name + job.submit(**submit_kwargs) job.wait_for_resource_creation() logger.info(f"Created job: {job.resource_name}") # Copying https://github.com/googleapis/python-aiplatform/blob/v1.48.0/google/cloud/aiplatform/jobs.py#L207-L215 diff --git a/tests/unit/src/common/vertex_ai_test.py b/tests/unit/src/common/vertex_ai_test.py index 7037352cd..4bcd53937 100644 --- a/tests/unit/src/common/vertex_ai_test.py +++ b/tests/unit/src/common/vertex_ai_test.py @@ -59,6 +59,7 @@ def test_submit_job_passes_tensorboard_and_base_output_dir( submit_kwargs["tensorboard"], job_config.tensorboard_resource_name, ) + self.assertNotIn("experiment", submit_kwargs) def test_vertex_ai_job_config_carries_experiment_name(self) -> None: @@ -71,6 +72,83 @@ def test_vertex_ai_job_config_carries_experiment_name(self) -> None: ) self.assertEqual(cfg.tensorboard_experiment_name, "my-comparison") + @patch("gigl.common.services.vertex_ai.aiplatform.Experiment") + @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") + @patch("gigl.common.services.vertex_ai.aiplatform.init") + def test_submit_job_uses_experiment_when_set( + self, + mock_aiplatform_init: Mock, + mock_custom_job_class: Mock, + mock_experiment_cls: Mock, + ) -> None: + """When tensorboard_experiment_name is set, submit uses experiment= and experiment_run= instead of tensorboard=.""" + mock_exp = Mock() + mock_exp.get_backing_tensorboard_resource.return_value = Mock( + 
resource_name="projects/test/locations/us-central1/tensorboards/123" + ) + mock_experiment_cls.get.return_value = mock_exp + + mock_job = Mock() + mock_job.resource_name = "projects/test/locations/us-central1/customJobs/456" + mock_job.name = "456" + mock_custom_job_class.return_value = mock_job + + service = VertexAIService( + project="test-project", + location="us-central1", + service_account="svc@test-project.iam.gserviceaccount.com", + staging_bucket="gs://test-staging-bucket", + ) + + job_config = VertexAiJobConfig( + job_name="test-job-exp", + container_uri="gcr.io/test/image:latest", + command=["python", "-m", "trainer"], + base_output_dir="gs://test-perm-bucket/test-job/trainer", + tensorboard_resource_name="projects/test/locations/us-central1/tensorboards/123", + tensorboard_experiment_name="my-comparison", + ) + + service.launch_job(job_config=job_config) + + mock_job.submit.assert_called_once() + submit_kwargs = mock_job.submit.call_args.kwargs + self.assertEqual(submit_kwargs["experiment"], "my-comparison") + self.assertEqual(submit_kwargs["experiment_run"], job_config.job_name) + self.assertNotIn("tensorboard", submit_kwargs) + + @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") + @patch("gigl.common.services.vertex_ai.aiplatform.init") + def test_submit_job_raises_when_experiment_name_set_but_no_tb_resource( + self, + mock_aiplatform_init: Mock, + mock_custom_job_class: Mock, + ) -> None: + """When tensorboard_experiment_name is set but tensorboard_resource_name is empty, raises ValueError.""" + mock_job = Mock() + mock_custom_job_class.return_value = mock_job + + service = VertexAIService( + project="test-project", + location="us-central1", + service_account="svc@test-project.iam.gserviceaccount.com", + staging_bucket="gs://test-staging-bucket", + ) + + job_config = VertexAiJobConfig( + job_name="test-job-no-tb", + container_uri="gcr.io/test/image:latest", + command=["python", "-m", "trainer"], + 
base_output_dir="gs://test-perm-bucket/test-job/trainer", + tensorboard_resource_name="", + tensorboard_experiment_name="my-comparison", + ) + + with self.assertRaises(ValueError) as ctx: + service.launch_job(job_config=job_config) + + self.assertIn("tensorboard_resource_name", str(ctx.exception)) + class TestEnsureExperimentWithBackingTb(TestCase): """Tests for VertexAIService._ensure_experiment_with_backing_tb.""" From 9d3109e961e3d4ec39560ba8adbc1ca1887baf2a Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Mon, 4 May 2026 17:32:02 +0000 Subject: [PATCH 25/59] launcher: thread tensorboard_experiment_name through _build_job_config --- gigl/src/common/vertex_ai_launcher.py | 5 ++ .../src/common/vertex_ai_launcher_test.py | 47 +++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/gigl/src/common/vertex_ai_launcher.py b/gigl/src/common/vertex_ai_launcher.py index 86730fcde..a2c2e19c3 100644 --- a/gigl/src/common/vertex_ai_launcher.py +++ b/gigl/src/common/vertex_ai_launcher.py @@ -237,6 +237,7 @@ def _build_job_config( env_vars: list[env_var.EnvVar], labels: Optional[dict[str, str]] = None, tensorboard_logs_uri: Optional[Uri] = None, + tensorboard_experiment_name: Optional[str] = None, ) -> VertexAiJobConfig: """Build a VertexAiJobConfig for training or inference jobs. @@ -258,6 +259,9 @@ def _build_job_config( env_vars (list[env_var.EnvVar]): Environment variables to set in the container. labels (Optional[dict[str, str]]): Labels to associate with the job. Defaults to None. tensorboard_logs_uri (Optional[Uri]): TensorBoard log URI for trainer jobs. + tensorboard_experiment_name (Optional[str]): If set, the job is + submitted as a run of the named Vertex AI Experiment. See + ``VertexAiJobConfig.tensorboard_experiment_name``. Returns: VertexAiJobConfig: A configuration object ready to be used with VertexAIService.launch_job(). 
@@ -315,6 +319,7 @@ def _build_job_config( if base_output_dir is not None else None ), + tensorboard_experiment_name=tensorboard_experiment_name, ) return job_config diff --git a/tests/unit/src/common/vertex_ai_launcher_test.py b/tests/unit/src/common/vertex_ai_launcher_test.py index f19eb0d93..c9ee88673 100644 --- a/tests/unit/src/common/vertex_ai_launcher_test.py +++ b/tests/unit/src/common/vertex_ai_launcher_test.py @@ -10,6 +10,7 @@ GiglResourceConfigWrapper, ) from gigl.src.common.vertex_ai_launcher import ( + _build_job_config, launch_graph_store_enabled_job, launch_single_pool_job, ) @@ -335,5 +336,51 @@ def test_launch_inference_single_pool_cpu(self, mock_vertex_ai_service_class): self.assertEqual(job_config.labels, expected_labels) + def test_build_job_config_threads_experiment_name(self) -> None: + """Test that tensorboard_experiment_name is forwarded to VertexAiJobConfig.""" + resource_config = gigl_resource_config_pb2.VertexAiResourceConfig( + machine_type="n1-standard-4", + gpu_type="ACCELERATOR_TYPE_UNSPECIFIED", + gpu_limit=0, + num_replicas=1, + tensorboard_resource_name="projects/p/locations/us/tensorboards/1", + ) + cfg = _build_job_config( + job_name="job", + task_config_uri=Uri("gs://b/task.yaml"), + resource_config_uri=Uri("gs://b/resource.yaml"), + command_str="python -m gigl.src.training.v2.glt_trainer", + args={}, + use_cuda=False, + container_uri="gcr.io/p/img", + vertex_ai_resource_config=resource_config, + env_vars=[], + tensorboard_logs_uri=Uri("gs://b/run/logs/"), + tensorboard_experiment_name="my-comparison", + ) + self.assertEqual(cfg.tensorboard_experiment_name, "my-comparison") + + def test_build_job_config_experiment_name_default(self) -> None: + """Test that tensorboard_experiment_name defaults to None/empty when not provided.""" + resource_config = gigl_resource_config_pb2.VertexAiResourceConfig( + machine_type="n1-standard-4", + gpu_type="ACCELERATOR_TYPE_UNSPECIFIED", + gpu_limit=0, + num_replicas=1, + ) + cfg = 
_build_job_config( + job_name="job", + task_config_uri=Uri("gs://b/task.yaml"), + resource_config_uri=Uri("gs://b/resource.yaml"), + command_str="python -m gigl.src.training.v2.glt_trainer", + args={}, + use_cuda=False, + container_uri="gcr.io/p/img", + vertex_ai_resource_config=resource_config, + env_vars=[], + ) + self.assertFalse(cfg.tensorboard_experiment_name) + + if __name__ == "__main__": absltest.main() From 1ffd750757896766a105cceab28820286063b2ca Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Mon, 4 May 2026 17:38:44 +0000 Subject: [PATCH 26/59] launcher: thread tensorboard_experiment_name through launch entrypoints Co-Authored-By: Claude Sonnet 4.6 --- gigl/src/common/vertex_ai_launcher.py | 12 ++ .../src/common/vertex_ai_launcher_test.py | 107 ++++++++++++++++++ 2 files changed, 119 insertions(+) diff --git a/gigl/src/common/vertex_ai_launcher.py b/gigl/src/common/vertex_ai_launcher.py index a2c2e19c3..cb83d1663 100644 --- a/gigl/src/common/vertex_ai_launcher.py +++ b/gigl/src/common/vertex_ai_launcher.py @@ -53,6 +53,7 @@ def launch_single_pool_job( component: GiGLComponents, vertex_ai_region: str, tensorboard_logs_uri: Optional[Uri] = None, + tensorboard_experiment_name: Optional[str] = None, ) -> None: """Launch a single pool job on Vertex AI. @@ -69,6 +70,10 @@ def launch_single_pool_job( component: The GiGL component (Trainer or Inferencer) vertex_ai_region: The Vertex AI region to launch the job in tensorboard_logs_uri: Optional TensorBoard log URI for trainer jobs + tensorboard_experiment_name: Optional Vertex AI Experiment name. When set, + the trainer's CustomJob is submitted as a run of the named experiment so + multiple jobs sharing the name can be compared on a single TensorBoard + page. See ``VertexAiJobConfig.tensorboard_experiment_name``. 
""" if component not in _LAUNCHABLE_COMPONENTS: raise ValueError( @@ -93,6 +98,7 @@ def launch_single_pool_job( env_vars=[env_var.EnvVar(name="TF_CPP_MIN_LOG_LEVEL", value="3")], labels=resource_config_wrapper.get_resource_labels(component=component), tensorboard_logs_uri=tensorboard_logs_uri, + tensorboard_experiment_name=tensorboard_experiment_name, ) logger.info(f"Launching {component.value} job with config: {job_config}") @@ -119,6 +125,7 @@ def launch_graph_store_enabled_job( cuda_docker_uri: Optional[str], component: GiGLComponents, tensorboard_logs_uri: Optional[Uri] = None, + tensorboard_experiment_name: Optional[str] = None, ) -> None: """Launch a graph store enabled job on Vertex AI with separate storage and compute pools. @@ -136,6 +143,10 @@ def launch_graph_store_enabled_job( cuda_docker_uri: Docker image URI for GPU execution component: The GiGL component (Trainer or Inferencer) tensorboard_logs_uri: Optional TensorBoard log URI for trainer jobs + tensorboard_experiment_name: Optional Vertex AI Experiment name. When set, + the trainer's CustomJob is submitted as a run of the named experiment so + multiple jobs sharing the name can be compared on a single TensorBoard + page. See ``VertexAiJobConfig.tensorboard_experiment_name``. 
""" if component not in _LAUNCHABLE_COMPONENTS: raise ValueError( @@ -190,6 +201,7 @@ def launch_graph_store_enabled_job( env_vars=environment_variables, labels=labels, tensorboard_logs_uri=tensorboard_logs_uri, + tensorboard_experiment_name=tensorboard_experiment_name, ) # Create storage pool job config diff --git a/tests/unit/src/common/vertex_ai_launcher_test.py b/tests/unit/src/common/vertex_ai_launcher_test.py index c9ee88673..59c69107e 100644 --- a/tests/unit/src/common/vertex_ai_launcher_test.py +++ b/tests/unit/src/common/vertex_ai_launcher_test.py @@ -336,6 +336,113 @@ def test_launch_inference_single_pool_cpu(self, mock_vertex_ai_service_class): self.assertEqual(job_config.labels, expected_labels) + @patch("gigl.src.common.vertex_ai_launcher.VertexAIService") + def test_launch_single_pool_job_threads_experiment_name( + self, mock_vertex_ai_service_class + ): + """Test that tensorboard_experiment_name is forwarded to the VertexAiJobConfig + when passed to launch_single_pool_job.""" + job_name = "test-single-pool-tb-exp" + task_config_uri = Uri("gs://bucket/task_config.yaml") + resource_config_uri = Uri("gs://bucket/resource_config.yaml") + process_command = "python -m gigl.src.training.v2.glt_trainer" + process_runtime_args: dict[str, str] = {} + cpu_docker_uri = "gcr.io/project/cpu-image:tag" + cuda_docker_uri = "gcr.io/project/cuda-image:tag" + component = GiGLComponents.Trainer + vertex_ai_region = "us-central1" + experiment_name = "my-single-pool-experiment" + + gigl_resource_config_proto = ( + _create_gigl_resource_config_with_single_pool_inference( + cost_resource_group="gigl_train" + ) + ) + resource_config_wrapper = GiglResourceConfigWrapper( + resource_config=gigl_resource_config_proto + ) + vertex_ai_config = gigl_resource_config_proto.inferencer_resource_config.vertex_ai_inferencer_config + + mock_service_instance = Mock() + mock_vertex_ai_service_class.return_value = mock_service_instance + + launch_single_pool_job( + 
vertex_ai_resource_config=vertex_ai_config, + job_name=job_name, + task_config_uri=task_config_uri, + resource_config_uri=resource_config_uri, + process_command=process_command, + process_runtime_args=process_runtime_args, + resource_config_wrapper=resource_config_wrapper, + cpu_docker_uri=cpu_docker_uri, + cuda_docker_uri=cuda_docker_uri, + component=component, + vertex_ai_region=vertex_ai_region, + tensorboard_logs_uri=Uri("gs://bucket/job/trainer/logs/"), + tensorboard_experiment_name=experiment_name, + ) + + mock_service_instance.launch_job.assert_called_once() + call_args = mock_service_instance.launch_job.call_args + job_config = call_args.kwargs["job_config"] + self.assertEqual(job_config.tensorboard_experiment_name, experiment_name) + + @patch("gigl.src.common.vertex_ai_launcher.VertexAIService") + def test_launch_graph_store_job_threads_experiment_name_to_compute_pool_only( + self, mock_vertex_ai_service_class + ): + """Test that tensorboard_experiment_name is forwarded to the compute pool's + VertexAiJobConfig but NOT to the storage pool's VertexAiJobConfig.""" + job_name = "test-graph-store-tb-exp" + task_config_uri = Uri("gs://bucket/task_config.yaml") + resource_config_uri = Uri("gs://bucket/resource_config.yaml") + process_command = "python -m gigl.src.training.v2.glt_trainer" + process_runtime_args: dict[str, str] = {} + cpu_docker_uri = "gcr.io/project/cpu-image:tag" + cuda_docker_uri = "gcr.io/project/cuda-image:tag" + component = GiGLComponents.Trainer + experiment_name = "my-graph-store-experiment" + + gigl_resource_config_proto = _create_gigl_resource_config_with_graph_store( + cost_resource_group="gigl_train" + ) + resource_config_wrapper = GiglResourceConfigWrapper( + resource_config=gigl_resource_config_proto + ) + graph_store_config = gigl_resource_config_proto.trainer_resource_config.vertex_ai_graph_store_trainer_config + + mock_service_instance = Mock() + mock_vertex_ai_service_class.return_value = mock_service_instance + + 
launch_graph_store_enabled_job( + vertex_ai_graph_store_config=graph_store_config, + job_name=job_name, + task_config_uri=task_config_uri, + resource_config_uri=resource_config_uri, + compute_commmand=process_command, + compute_runtime_args=process_runtime_args, + resource_config_wrapper=resource_config_wrapper, + storage_command="python -m gigl.distributed.graph_store.storage_main", + storage_args={}, + cpu_docker_uri=cpu_docker_uri, + cuda_docker_uri=cuda_docker_uri, + component=component, + tensorboard_logs_uri=Uri("gs://bucket/job/trainer/logs/"), + tensorboard_experiment_name=experiment_name, + ) + + mock_service_instance.launch_graph_store_job.assert_called_once() + call_args = mock_service_instance.launch_graph_store_job.call_args + compute_job_config = call_args.kwargs["compute_pool_job_config"] + storage_job_config = call_args.kwargs["storage_pool_job_config"] + + # Compute pool SHOULD have the experiment name + self.assertEqual( + compute_job_config.tensorboard_experiment_name, experiment_name + ) + # Storage pool MUST NOT have the experiment name + self.assertFalse(storage_job_config.tensorboard_experiment_name) + def test_build_job_config_threads_experiment_name(self) -> None: """Test that tensorboard_experiment_name is forwarded to VertexAiJobConfig.""" resource_config = gigl_resource_config_pb2.VertexAiResourceConfig( From 07784ed687d6365d13649ac8fb5272ef7c04bab9 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Mon, 4 May 2026 17:45:20 +0000 Subject: [PATCH 27/59] test(launcher): tighten experiment_name negative assertions to assertIsNone --- tests/unit/src/common/vertex_ai_launcher_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/src/common/vertex_ai_launcher_test.py b/tests/unit/src/common/vertex_ai_launcher_test.py index 59c69107e..dd54e1582 100644 --- a/tests/unit/src/common/vertex_ai_launcher_test.py +++ b/tests/unit/src/common/vertex_ai_launcher_test.py @@ -441,7 +441,7 @@ def 
test_launch_graph_store_job_threads_experiment_name_to_compute_pool_only( compute_job_config.tensorboard_experiment_name, experiment_name ) # Storage pool MUST NOT have the experiment name - self.assertFalse(storage_job_config.tensorboard_experiment_name) + self.assertIsNone(storage_job_config.tensorboard_experiment_name) def test_build_job_config_threads_experiment_name(self) -> None: """Test that tensorboard_experiment_name is forwarded to VertexAiJobConfig.""" @@ -486,7 +486,7 @@ def test_build_job_config_experiment_name_default(self) -> None: vertex_ai_resource_config=resource_config, env_vars=[], ) - self.assertFalse(cfg.tensorboard_experiment_name) + self.assertIsNone(cfg.tensorboard_experiment_name) if __name__ == "__main__": From a1d73e9e7cd797542b8fb877f22872f5df8219fe Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Mon, 4 May 2026 17:57:19 +0000 Subject: [PATCH 28/59] trainer: forward TrainerConfig.tensorboard_experiment_name to launcher Co-Authored-By: Claude Sonnet 4.6 --- gigl/src/training/v1/trainer.py | 4 + gigl/src/training/v2/glt_trainer.py | 5 + tests/unit/src/training/glt_trainer_test.py | 145 ++++++++++++++++++++ tests/unit/src/training/v1_trainer_test.py | 96 +++++++++++++ 4 files changed, 250 insertions(+) create mode 100644 tests/unit/src/training/glt_trainer_test.py create mode 100644 tests/unit/src/training/v1_trainer_test.py diff --git a/gigl/src/training/v1/trainer.py b/gigl/src/training/v1/trainer.py index 0c8790663..2ecd89556 100644 --- a/gigl/src/training/v1/trainer.py +++ b/gigl/src/training/v1/trainer.py @@ -55,6 +55,9 @@ def run( if raw_tensorboard_logs_uri else None ) + tensorboard_experiment_name = ( + gbml_config_pb_wrapper.trainer_config.tensorboard_experiment_name or None + ) launch_single_pool_job( vertex_ai_resource_config=trainer_config, job_name=str(applied_task_identifier), @@ -68,6 +71,7 @@ def run( component=GiGLComponents.Trainer, vertex_ai_region=resource_config.vertex_ai_trainer_region, 
tensorboard_logs_uri=tensorboard_logs_uri, + tensorboard_experiment_name=tensorboard_experiment_name, ) elif isinstance(trainer_config, LocalResourceConfig): diff --git a/gigl/src/training/v2/glt_trainer.py b/gigl/src/training/v2/glt_trainer.py index 4f2ecadd1..e95ba22c8 100644 --- a/gigl/src/training/v2/glt_trainer.py +++ b/gigl/src/training/v2/glt_trainer.py @@ -60,6 +60,9 @@ def __execute_VAI_training( if raw_tensorboard_logs_uri else None ) + tensorboard_experiment_name = ( + gbml_config_pb_wrapper.trainer_config.tensorboard_experiment_name or None + ) job_name = f"gigl_train_{applied_task_identifier}" @@ -77,6 +80,7 @@ def __execute_VAI_training( component=GiGLComponents.Trainer, vertex_ai_region=resource_config.vertex_ai_trainer_region, tensorboard_logs_uri=tensorboard_logs_uri, + tensorboard_experiment_name=tensorboard_experiment_name, ) elif isinstance(resource_config.trainer_config, VertexAiGraphStoreConfig): launch_graph_store_enabled_job( @@ -93,6 +97,7 @@ def __execute_VAI_training( cuda_docker_uri=cuda_docker_uri, component=GiGLComponents.Trainer, tensorboard_logs_uri=tensorboard_logs_uri, + tensorboard_experiment_name=tensorboard_experiment_name, ) else: raise NotImplementedError( diff --git a/tests/unit/src/training/glt_trainer_test.py b/tests/unit/src/training/glt_trainer_test.py new file mode 100644 index 000000000..a6190adec --- /dev/null +++ b/tests/unit/src/training/glt_trainer_test.py @@ -0,0 +1,145 @@ +"""Unit tests for GLTTrainer — verifies tensorboard_experiment_name forwarding.""" + +from unittest.mock import MagicMock, patch + +from gigl.common import UriFactory +from gigl.src.common.types import AppliedTaskIdentifier +from gigl.src.training.v2.glt_trainer import GLTTrainer +from snapchat.research.gbml import gbml_config_pb2 +from snapchat.research.gbml import gigl_resource_config_pb2 +from tests.test_assets.test_case import TestCase + + +def _make_resource_config_wrapper_with_single_pool() -> MagicMock: + """Return a 
GiglResourceConfigWrapper mock backed by a VertexAiResourceConfig.""" + vertex_ai_config = gigl_resource_config_pb2.VertexAiResourceConfig( + machine_type="n1-standard-8", + num_replicas=1, + timeout=7200, + ) + mock_wrapper = MagicMock() + mock_wrapper.trainer_config = vertex_ai_config + mock_wrapper.vertex_ai_trainer_region = "us-central1" + return mock_wrapper + + +def _make_resource_config_wrapper_with_graph_store() -> MagicMock: + """Return a GiglResourceConfigWrapper mock backed by a VertexAiGraphStoreConfig.""" + compute_pool = gigl_resource_config_pb2.VertexAiResourceConfig( + machine_type="n1-standard-16", + num_replicas=1, + ) + storage_pool = gigl_resource_config_pb2.VertexAiResourceConfig( + machine_type="n1-highmem-32", + num_replicas=1, + ) + graph_store_config = gigl_resource_config_pb2.VertexAiGraphStoreConfig( + compute_pool=compute_pool, + graph_store_pool=storage_pool, + compute_cluster_local_world_size=4, + ) + mock_wrapper = MagicMock() + mock_wrapper.trainer_config = graph_store_config + return mock_wrapper + + +def _make_gbml_config_pb_wrapper(experiment_name: str = "my-comparison") -> MagicMock: + """Return a GbmlConfigPbWrapper mock with tensorboard_experiment_name set.""" + trainer_config_proto = gbml_config_pb2.GbmlConfig.TrainerConfig( + command="python -m gigl.src.training.v2.glt_trainer", + tensorboard_experiment_name=experiment_name, + ) + + mock_wrapper = MagicMock() + mock_wrapper.trainer_config = trainer_config_proto + # Ensure tensorboard_logs_uri is empty so UriFactory is not called. 
+ mock_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri = "" + return mock_wrapper + + +class TestGltTrainerExperimentNameForwarding(TestCase): + """Tests that GLTTrainer forwards tensorboard_experiment_name to the launcher.""" + + @patch("gigl.src.training.v2.glt_trainer.launch_single_pool_job") + @patch("gigl.src.training.v2.glt_trainer.GbmlConfigPbWrapper") + @patch("gigl.src.training.v2.glt_trainer.get_resource_config") + def test_single_pool_forwards_experiment_name( + self, + mock_get_resource_config, + mock_gbml_config_cls, + mock_launch_single_pool_job, + ) -> None: + """launch_single_pool_job receives tensorboard_experiment_name='my-comparison'.""" + mock_get_resource_config.return_value = ( + _make_resource_config_wrapper_with_single_pool() + ) + mock_gbml_config_cls.get_gbml_config_pb_wrapper_from_uri.return_value = ( + _make_gbml_config_pb_wrapper("my-comparison") + ) + + trainer = GLTTrainer() + trainer.run( + applied_task_identifier=AppliedTaskIdentifier("test-job"), + task_config_uri=UriFactory.create_uri("gs://bucket/task.yaml"), + resource_config_uri=UriFactory.create_uri("gs://bucket/resource.yaml"), + ) + + mock_launch_single_pool_job.assert_called_once() + call_kwargs = mock_launch_single_pool_job.call_args.kwargs + self.assertEqual(call_kwargs["tensorboard_experiment_name"], "my-comparison") + + @patch("gigl.src.training.v2.glt_trainer.launch_graph_store_enabled_job") + @patch("gigl.src.training.v2.glt_trainer.GbmlConfigPbWrapper") + @patch("gigl.src.training.v2.glt_trainer.get_resource_config") + def test_graph_store_forwards_experiment_name( + self, + mock_get_resource_config, + mock_gbml_config_cls, + mock_launch_graph_store_enabled_job, + ) -> None: + """launch_graph_store_enabled_job receives tensorboard_experiment_name='my-comparison'.""" + mock_get_resource_config.return_value = ( + _make_resource_config_wrapper_with_graph_store() + ) + mock_gbml_config_cls.get_gbml_config_pb_wrapper_from_uri.return_value = ( + 
_make_gbml_config_pb_wrapper("my-comparison") + ) + + trainer = GLTTrainer() + trainer.run( + applied_task_identifier=AppliedTaskIdentifier("test-job"), + task_config_uri=UriFactory.create_uri("gs://bucket/task.yaml"), + resource_config_uri=UriFactory.create_uri("gs://bucket/resource.yaml"), + ) + + mock_launch_graph_store_enabled_job.assert_called_once() + call_kwargs = mock_launch_graph_store_enabled_job.call_args.kwargs + self.assertEqual(call_kwargs["tensorboard_experiment_name"], "my-comparison") + + @patch("gigl.src.training.v2.glt_trainer.launch_single_pool_job") + @patch("gigl.src.training.v2.glt_trainer.GbmlConfigPbWrapper") + @patch("gigl.src.training.v2.glt_trainer.get_resource_config") + def test_single_pool_empty_experiment_name_becomes_none( + self, + mock_get_resource_config, + mock_gbml_config_cls, + mock_launch_single_pool_job, + ) -> None: + """Empty string tensorboard_experiment_name is coerced to None.""" + mock_get_resource_config.return_value = ( + _make_resource_config_wrapper_with_single_pool() + ) + mock_gbml_config_cls.get_gbml_config_pb_wrapper_from_uri.return_value = ( + _make_gbml_config_pb_wrapper("") # proto default empty string + ) + + trainer = GLTTrainer() + trainer.run( + applied_task_identifier=AppliedTaskIdentifier("test-job"), + task_config_uri=UriFactory.create_uri("gs://bucket/task.yaml"), + resource_config_uri=UriFactory.create_uri("gs://bucket/resource.yaml"), + ) + + mock_launch_single_pool_job.assert_called_once() + call_kwargs = mock_launch_single_pool_job.call_args.kwargs + self.assertIsNone(call_kwargs["tensorboard_experiment_name"]) diff --git a/tests/unit/src/training/v1_trainer_test.py b/tests/unit/src/training/v1_trainer_test.py new file mode 100644 index 000000000..f253c7dfa --- /dev/null +++ b/tests/unit/src/training/v1_trainer_test.py @@ -0,0 +1,96 @@ +"""Unit tests for v1 Trainer — verifies tensorboard_experiment_name forwarding.""" + +from unittest.mock import MagicMock, patch + +from gigl.common import 
UriFactory +from gigl.src.common.types import AppliedTaskIdentifier +from gigl.src.training.v1.trainer import Trainer +from snapchat.research.gbml import gbml_config_pb2 +from snapchat.research.gbml import gigl_resource_config_pb2 +from tests.test_assets.test_case import TestCase + + +def _make_resource_config_wrapper_with_single_pool() -> MagicMock: + """Return a GiglResourceConfigWrapper mock backed by a VertexAiResourceConfig.""" + vertex_ai_config = gigl_resource_config_pb2.VertexAiResourceConfig( + machine_type="n1-standard-8", + num_replicas=1, + timeout=7200, + ) + mock_wrapper = MagicMock() + mock_wrapper.trainer_config = vertex_ai_config + mock_wrapper.vertex_ai_trainer_region = "us-central1" + return mock_wrapper + + +def _make_gbml_config_pb_wrapper(experiment_name: str = "my-comparison") -> MagicMock: + """Return a GbmlConfigPbWrapper mock with tensorboard_experiment_name set.""" + trainer_config_proto = gbml_config_pb2.GbmlConfig.TrainerConfig( + tensorboard_experiment_name=experiment_name, + ) + + mock_wrapper = MagicMock() + mock_wrapper.trainer_config = trainer_config_proto + # Ensure tensorboard_logs_uri is empty so UriFactory is not called. 
+ mock_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri = "" + return mock_wrapper + + +class TestV1TrainerExperimentNameForwarding(TestCase): + """Tests that v1 Trainer forwards tensorboard_experiment_name to the launcher.""" + + @patch("gigl.src.training.v1.trainer.launch_single_pool_job") + @patch("gigl.src.training.v1.trainer.GbmlConfigPbWrapper") + @patch("gigl.src.training.v1.trainer.get_resource_config") + def test_single_pool_forwards_experiment_name( + self, + mock_get_resource_config, + mock_gbml_config_cls, + mock_launch_single_pool_job, + ) -> None: + """launch_single_pool_job receives tensorboard_experiment_name='my-comparison'.""" + mock_get_resource_config.return_value = ( + _make_resource_config_wrapper_with_single_pool() + ) + mock_gbml_config_cls.get_gbml_config_pb_wrapper_from_uri.return_value = ( + _make_gbml_config_pb_wrapper("my-comparison") + ) + + trainer = Trainer() + trainer.run( + applied_task_identifier=AppliedTaskIdentifier("test-job"), + task_config_uri=UriFactory.create_uri("gs://bucket/task.yaml"), + resource_config_uri=UriFactory.create_uri("gs://bucket/resource.yaml"), + ) + + mock_launch_single_pool_job.assert_called_once() + call_kwargs = mock_launch_single_pool_job.call_args.kwargs + self.assertEqual(call_kwargs["tensorboard_experiment_name"], "my-comparison") + + @patch("gigl.src.training.v1.trainer.launch_single_pool_job") + @patch("gigl.src.training.v1.trainer.GbmlConfigPbWrapper") + @patch("gigl.src.training.v1.trainer.get_resource_config") + def test_single_pool_empty_experiment_name_becomes_none( + self, + mock_get_resource_config, + mock_gbml_config_cls, + mock_launch_single_pool_job, + ) -> None: + """Empty string tensorboard_experiment_name is coerced to None.""" + mock_get_resource_config.return_value = ( + _make_resource_config_wrapper_with_single_pool() + ) + mock_gbml_config_cls.get_gbml_config_pb_wrapper_from_uri.return_value = ( + _make_gbml_config_pb_wrapper("") # proto default empty string + ) + 
+ trainer = Trainer() + trainer.run( + applied_task_identifier=AppliedTaskIdentifier("test-job"), + task_config_uri=UriFactory.create_uri("gs://bucket/task.yaml"), + resource_config_uri=UriFactory.create_uri("gs://bucket/resource.yaml"), + ) + + mock_launch_single_pool_job.assert_called_once() + call_kwargs = mock_launch_single_pool_job.call_args.kwargs + self.assertIsNone(call_kwargs["tensorboard_experiment_name"]) From 8dc0b49bc155a339cb8eeaa4b6c2ccc640b9378d Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Mon, 4 May 2026 18:11:01 +0000 Subject: [PATCH 29/59] examples: demo TrainerConfig.tensorboardExperimentName --- .../configs/e2e_hom_cora_sup_task_config.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml b/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml index 606f13c29..f0088eaaf 100644 --- a/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml +++ b/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml @@ -18,6 +18,12 @@ trainerConfig: log_every_n_batch: "50" # Frequency in which we log batch information num_neighbors: "[10, 10]" # Fanout per hop, specified as a string representation of a list for the homogeneous use case command: python -m examples.link_prediction.homogeneous_training + # Optional. When set, the trainer's CustomJob is submitted as a run of the + # named Vertex AI Experiment. Multiple jobs sharing this name appear as + # comparable runs on a single TensorBoard page. Requires + # GiglResourceConfig.trainerResourceConfig...tensorboardResourceName to be + # set. See proto/snapchat/research/gbml/gbml_config.proto for details. 
+ tensorboardExperimentName: "homogeneous-link-prediction-comparison" inferencerConfig: inferencerArgs: # Example argument to inferencer From 99ab56d8d12a590e6ce1f5eb962dbe97c887a0a8 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Mon, 4 May 2026 18:19:16 +0000 Subject: [PATCH 30/59] validation: tensorboard_experiment_name also requires tensorboard_logs_uri --- ...nd_resource_config_compatibility_checks.py | 10 ++++ ...source_config_compatibility_checks_test.py | 58 +++++++++++++++++-- 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py index 9920196d3..b49182c79 100644 --- a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py +++ b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py @@ -141,6 +141,16 @@ def check_vertex_ai_trainer_tensorboard_compatibility( "configured on the trainer resource config; the experiment needs a " "backing TB resource." ) + tb_logs_uri = ( + gbml_config_pb_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri + ) + assert tb_logs_uri, ( + "GbmlConfig.trainer_config.tensorboard_experiment_name is set " + f"({experiment_name!r}) but " + "GbmlConfig.shared_config.trained_model_metadata.tensorboard_logs_uri " + "is not; the trainer needs a log directory under the CustomJob's " + "base_output_directory for the Experiment to receive events." 
+ ) if not gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard: return diff --git a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py index a09fd2b86..0b8d118d2 100644 --- a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py +++ b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py @@ -104,10 +104,21 @@ def _create_gbml_config_with_tensorboard_enabled() -> GbmlConfigPbWrapper: def _create_gbml_config_with_tensorboard_experiment_name( experiment_name: str = "my-comparison", + tensorboard_logs_uri: str = "", ) -> GbmlConfigPbWrapper: - """Create a GbmlConfig with trainer tensorboard_experiment_name set.""" + """Create a GbmlConfig with trainer tensorboard_experiment_name set. + + Args: + experiment_name: The TensorBoard experiment name to set. + tensorboard_logs_uri: Optional GCS URI for TensorBoard logs. When non-empty, + sets ``shared_config.trained_model_metadata.tensorboard_logs_uri``. 
+ """ gbml_config = gbml_config_pb2.GbmlConfig() gbml_config.trainer_config.tensorboard_experiment_name = experiment_name + if tensorboard_logs_uri: + gbml_config.shared_config.trained_model_metadata.tensorboard_logs_uri = ( + tensorboard_logs_uri + ) return GbmlConfigPbWrapper(gbml_config_pb=gbml_config) @@ -313,9 +324,10 @@ def test_experiment_name_set_without_tensorboard_resource_raises(self): self.assertIn("tensorboard_experiment_name", str(ctx.exception)) def test_experiment_name_set_with_tensorboard_resource_does_not_raise(self): - """tensorboard_experiment_name set and TB resource present → no exception.""" + """tensorboard_experiment_name set, TB resource present, and logs URI set → no exception.""" gbml_config = _create_gbml_config_with_tensorboard_experiment_name( - experiment_name="my-comparison" + experiment_name="my-comparison", + tensorboard_logs_uri="gs://test-bucket/run/logs/", ) resource_config = _create_resource_config_with_trainer_tensorboard( tensorboard_resource_name=( @@ -329,9 +341,10 @@ def test_experiment_name_set_with_tensorboard_resource_does_not_raise(self): ) def test_experiment_name_set_with_graph_store_tensorboard_resource_does_not_raise(self): - """tensorboard_experiment_name set and graph-store TB resource present → no exception.""" + """tensorboard_experiment_name set, graph-store TB resource present, and logs URI set → no exception.""" gbml_config = _create_gbml_config_with_tensorboard_experiment_name( - experiment_name="my-comparison" + experiment_name="my-comparison", + tensorboard_logs_uri="gs://test-bucket/run/logs/", ) resource_config = _create_resource_config_with_trainer_tensorboard( tensorboard_resource_name=( @@ -345,6 +358,41 @@ def test_experiment_name_set_with_graph_store_tensorboard_resource_does_not_rais resource_config_wrapper=resource_config, ) + def test_experiment_name_set_without_tensorboard_logs_uri_raises(self): + """tensorboard_experiment_name set and TB resource present but logs URI empty → AssertionError 
mentioning tensorboard_logs_uri.""" + gbml_config = _create_gbml_config_with_tensorboard_experiment_name( + experiment_name="my-comparison", + ) + resource_config = _create_resource_config_with_trainer_tensorboard( + tensorboard_resource_name=( + "projects/test-project/locations/us-central1/tensorboards/test" + ) + ) + + with self.assertRaises(AssertionError) as ctx: + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + self.assertIn("tensorboard_logs_uri", str(ctx.exception)) + + def test_experiment_name_set_with_all_three_does_not_raise(self): + """tensorboard_experiment_name, tensorboard_resource_name, and tensorboard_logs_uri all set → no exception.""" + gbml_config = _create_gbml_config_with_tensorboard_experiment_name( + experiment_name="my-comparison", + tensorboard_logs_uri="gs://test-bucket/run/logs/", + ) + resource_config = _create_resource_config_with_trainer_tensorboard( + tensorboard_resource_name=( + "projects/test-project/locations/us-central1/tensorboards/test" + ) + ) + + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + if __name__ == "__main__": absltest.main() From 73103ed35e0557cbe1576f2c08c4f239a1740a02 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Mon, 4 May 2026 19:07:39 +0000 Subject: [PATCH 31/59] Revert "validation: tensorboard_experiment_name also requires tensorboard_logs_uri" This reverts commit 99ab56d8d12a590e6ce1f5eb962dbe97c887a0a8. 
--- ...nd_resource_config_compatibility_checks.py | 10 ---- ...source_config_compatibility_checks_test.py | 58 ++----------------- 2 files changed, 5 insertions(+), 63 deletions(-) diff --git a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py index b49182c79..9920196d3 100644 --- a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py +++ b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py @@ -141,16 +141,6 @@ def check_vertex_ai_trainer_tensorboard_compatibility( "configured on the trainer resource config; the experiment needs a " "backing TB resource." ) - tb_logs_uri = ( - gbml_config_pb_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri - ) - assert tb_logs_uri, ( - "GbmlConfig.trainer_config.tensorboard_experiment_name is set " - f"({experiment_name!r}) but " - "GbmlConfig.shared_config.trained_model_metadata.tensorboard_logs_uri " - "is not; the trainer needs a log directory under the CustomJob's " - "base_output_directory for the Experiment to receive events." - ) if not gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard: return diff --git a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py index 0b8d118d2..a09fd2b86 100644 --- a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py +++ b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py @@ -104,21 +104,10 @@ def _create_gbml_config_with_tensorboard_enabled() -> GbmlConfigPbWrapper: def _create_gbml_config_with_tensorboard_experiment_name( experiment_name: str = "my-comparison", - tensorboard_logs_uri: str = "", ) -> GbmlConfigPbWrapper: - """Create a GbmlConfig with trainer tensorboard_experiment_name set. 
- - Args: - experiment_name: The TensorBoard experiment name to set. - tensorboard_logs_uri: Optional GCS URI for TensorBoard logs. When non-empty, - sets ``shared_config.trained_model_metadata.tensorboard_logs_uri``. - """ + """Create a GbmlConfig with trainer tensorboard_experiment_name set.""" gbml_config = gbml_config_pb2.GbmlConfig() gbml_config.trainer_config.tensorboard_experiment_name = experiment_name - if tensorboard_logs_uri: - gbml_config.shared_config.trained_model_metadata.tensorboard_logs_uri = ( - tensorboard_logs_uri - ) return GbmlConfigPbWrapper(gbml_config_pb=gbml_config) @@ -324,10 +313,9 @@ def test_experiment_name_set_without_tensorboard_resource_raises(self): self.assertIn("tensorboard_experiment_name", str(ctx.exception)) def test_experiment_name_set_with_tensorboard_resource_does_not_raise(self): - """tensorboard_experiment_name set, TB resource present, and logs URI set → no exception.""" + """tensorboard_experiment_name set and TB resource present → no exception.""" gbml_config = _create_gbml_config_with_tensorboard_experiment_name( - experiment_name="my-comparison", - tensorboard_logs_uri="gs://test-bucket/run/logs/", + experiment_name="my-comparison" ) resource_config = _create_resource_config_with_trainer_tensorboard( tensorboard_resource_name=( @@ -341,10 +329,9 @@ def test_experiment_name_set_with_tensorboard_resource_does_not_raise(self): ) def test_experiment_name_set_with_graph_store_tensorboard_resource_does_not_raise(self): - """tensorboard_experiment_name set, graph-store TB resource present, and logs URI set → no exception.""" + """tensorboard_experiment_name set and graph-store TB resource present → no exception.""" gbml_config = _create_gbml_config_with_tensorboard_experiment_name( - experiment_name="my-comparison", - tensorboard_logs_uri="gs://test-bucket/run/logs/", + experiment_name="my-comparison" ) resource_config = _create_resource_config_with_trainer_tensorboard( tensorboard_resource_name=( @@ -358,41 +345,6 @@ def 
test_experiment_name_set_with_graph_store_tensorboard_resource_does_not_rais resource_config_wrapper=resource_config, ) - def test_experiment_name_set_without_tensorboard_logs_uri_raises(self): - """tensorboard_experiment_name set and TB resource present but logs URI empty → AssertionError mentioning tensorboard_logs_uri.""" - gbml_config = _create_gbml_config_with_tensorboard_experiment_name( - experiment_name="my-comparison", - ) - resource_config = _create_resource_config_with_trainer_tensorboard( - tensorboard_resource_name=( - "projects/test-project/locations/us-central1/tensorboards/test" - ) - ) - - with self.assertRaises(AssertionError) as ctx: - check_vertex_ai_trainer_tensorboard_compatibility( - gbml_config_pb_wrapper=gbml_config, - resource_config_wrapper=resource_config, - ) - self.assertIn("tensorboard_logs_uri", str(ctx.exception)) - - def test_experiment_name_set_with_all_three_does_not_raise(self): - """tensorboard_experiment_name, tensorboard_resource_name, and tensorboard_logs_uri all set → no exception.""" - gbml_config = _create_gbml_config_with_tensorboard_experiment_name( - experiment_name="my-comparison", - tensorboard_logs_uri="gs://test-bucket/run/logs/", - ) - resource_config = _create_resource_config_with_trainer_tensorboard( - tensorboard_resource_name=( - "projects/test-project/locations/us-central1/tensorboards/test" - ) - ) - - check_vertex_ai_trainer_tensorboard_compatibility( - gbml_config_pb_wrapper=gbml_config, - resource_config_wrapper=resource_config, - ) - if __name__ == "__main__": absltest.main() From fcc871d4bd7fdf992647260a443a1e50fef287a2 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Mon, 4 May 2026 20:39:11 +0000 Subject: [PATCH 32/59] vertex_ai: sanitize job_name for ExperimentRun ID, validate experiment_name Vertex AI MetadataStore Context IDs (which back ExperimentRuns) must match [a-z0-9][a-z0-9-]{0,127}. GiGL's gigl_train__ job names contain underscores and were rejected with a 400 from the SDK. 
Lowercase and replace underscores with hyphens for the run ID; fail fast with a clear message when the user-supplied tensorboard_experiment_name itself is invalid (don't silently rewrite user input). --- gigl/common/services/vertex_ai.py | 46 +++++++++- tests/unit/src/common/vertex_ai_test.py | 116 +++++++++++++++++++++++- 2 files changed, 160 insertions(+), 2 deletions(-) diff --git a/gigl/common/services/vertex_ai.py b/gigl/common/services/vertex_ai.py index bc8ed47e2..a44e3092d 100644 --- a/gigl/common/services/vertex_ai.py +++ b/gigl/common/services/vertex_ai.py @@ -60,6 +60,7 @@ def get_pipeline() -> int: # NOTE: `get_pipeline` here is the Pipeline name """ import datetime +import re import time from dataclasses import dataclass from typing import Any, Final, Optional, Union @@ -87,6 +88,39 @@ def get_pipeline() -> int: # NOTE: `get_pipeline` here is the Pipeline name DEFAULT_PIPELINE_TIMEOUT_S: Final[int] = 60 * 60 * 36 # 36 hours DEFAULT_CUSTOM_JOB_TIMEOUT_S: Final[int] = 60 * 60 * 24 # 24 hours +# Vertex AI Experiment / ExperimentRun resource IDs are MetadataStore Context +# IDs and must satisfy this regex (the SDK builds the run's resource ID as +# ``-``, so each part must individually conform). +_VERTEX_RESOURCE_ID_PATTERN: Final[re.Pattern[str]] = re.compile( + r"^[a-z0-9][a-z0-9-]{0,127}$" +) + + +def _sanitize_vertex_run_id(value: str) -> str: + """Coerce a GiGL job name to a valid Vertex AI ExperimentRun ID. + + Lowercases the string and replaces ``_`` with ``-``. Validates the result. + + Args: + value: A job name (typically GiGL ``gigl_train_...`` style). + + Returns: + A string matching ``[a-z0-9][a-z0-9-]{0,127}``. + + Raises: + ValueError: If sanitization can't produce a valid Vertex AI resource ID + (e.g. the input contains characters other than ``[A-Za-z0-9_-]`` or + is too long). 
+ """ + sanitized = value.lower().replace("_", "-") + if not _VERTEX_RESOURCE_ID_PATTERN.match(sanitized): + raise ValueError( + f"Cannot derive a valid Vertex AI ExperimentRun ID from {value!r}; " + f"after lowercasing and replacing underscores got {sanitized!r}, " + f"which does not match {_VERTEX_RESOURCE_ID_PATTERN.pattern}." + ) + return sanitized + @dataclass class VertexAiJobConfig: @@ -415,12 +449,22 @@ def _submit_job( "tensorboard_experiment_name is set but tensorboard_resource_name " "is not; the experiment needs a backing TB resource." ) + if not _VERTEX_RESOURCE_ID_PATTERN.match( + job_config.tensorboard_experiment_name + ): + raise ValueError( + f"tensorboard_experiment_name {job_config.tensorboard_experiment_name!r} " + f"is not a valid Vertex AI Experiment ID; it must match " + f"{_VERTEX_RESOURCE_ID_PATTERN.pattern}." + ) self._ensure_experiment_with_backing_tb( experiment_name=job_config.tensorboard_experiment_name, tensorboard_resource_name=job_config.tensorboard_resource_name, ) submit_kwargs["experiment"] = job_config.tensorboard_experiment_name - submit_kwargs["experiment_run"] = job_config.job_name + submit_kwargs["experiment_run"] = _sanitize_vertex_run_id( + job_config.job_name + ) else: submit_kwargs["tensorboard"] = job_config.tensorboard_resource_name job.submit(**submit_kwargs) diff --git a/tests/unit/src/common/vertex_ai_test.py b/tests/unit/src/common/vertex_ai_test.py index 4bcd53937..1f36c7463 100644 --- a/tests/unit/src/common/vertex_ai_test.py +++ b/tests/unit/src/common/vertex_ai_test.py @@ -4,7 +4,11 @@ from absl.testing import absltest -from gigl.common.services.vertex_ai import VertexAiJobConfig, VertexAIService +from gigl.common.services.vertex_ai import ( + VertexAiJobConfig, + VertexAIService, + _sanitize_vertex_run_id, +) from tests.test_assets.test_case import TestCase @@ -259,5 +263,115 @@ def test_experiment_exists_matching_backing_tb_is_noop( mock_existing_experiment.assign_backing_tensorboard.assert_not_called() 
+class TestSanitizeVertexRunId(TestCase): + """Tests for the _sanitize_vertex_run_id helper.""" + + def test_lowercases_and_replaces_underscores(self) -> None: + self.assertEqual( + _sanitize_vertex_run_id("gigl_train_hom_cora_sup_test_on_20260504_192055"), + "gigl-train-hom-cora-sup-test-on-20260504-192055", + ) + + def test_already_valid_passthrough(self) -> None: + self.assertEqual( + _sanitize_vertex_run_id("already-valid-id-123"), + "already-valid-id-123", + ) + + def test_uppercase_lowered(self) -> None: + self.assertEqual(_sanitize_vertex_run_id("MyJob"), "myjob") + + def test_invalid_chars_after_sanitization_raises(self) -> None: + with self.assertRaises(ValueError) as ctx: + _sanitize_vertex_run_id("has spaces and !@#") + self.assertIn("Vertex AI ExperimentRun ID", str(ctx.exception)) + + def test_leading_hyphen_rejected(self) -> None: + # Underscores at the start become hyphens, which violates the regex + # (the first character must be alphanumeric). + with self.assertRaises(ValueError): + _sanitize_vertex_run_id("_leading_underscore") + + +class TestSubmitJobSanitizesRunId(TestCase): + """Tests that _submit_job applies sanitization and validation.""" + + @patch("gigl.common.services.vertex_ai.aiplatform.Experiment") + @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") + @patch("gigl.common.services.vertex_ai.aiplatform.init") + def test_underscored_job_name_is_sanitized_for_experiment_run( + self, + mock_aiplatform_init: Mock, + mock_custom_job_class: Mock, + mock_experiment_cls: Mock, + ) -> None: + """The GiGL ``gigl_train_...`` job name must be coerced to a valid Vertex AI run ID.""" + mock_exp = Mock() + mock_exp.get_backing_tensorboard_resource.return_value = Mock( + resource_name="projects/test/locations/us-central1/tensorboards/123" + ) + mock_experiment_cls.get.return_value = mock_exp + + mock_job = Mock() + mock_job.resource_name = "projects/test/locations/us-central1/customJobs/456" + mock_job.name = "456" + 
mock_custom_job_class.return_value = mock_job + + service = VertexAIService( + project="test-project", + location="us-central1", + service_account="svc@test-project.iam.gserviceaccount.com", + staging_bucket="gs://test-staging-bucket", + ) + + job_config = VertexAiJobConfig( + job_name="gigl_train_my_run_20260504", + container_uri="gcr.io/test/image:latest", + command=["python", "-m", "trainer"], + base_output_dir="gs://test-perm-bucket/run/trainer", + tensorboard_resource_name="projects/test/locations/us-central1/tensorboards/123", + tensorboard_experiment_name="my-comparison", + ) + + service.launch_job(job_config=job_config) + + submit_kwargs = mock_job.submit.call_args.kwargs + self.assertEqual( + submit_kwargs["experiment_run"], "gigl-train-my-run-20260504" + ) + + @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") + @patch("gigl.common.services.vertex_ai.aiplatform.init") + def test_invalid_experiment_name_raises( + self, + mock_aiplatform_init: Mock, + mock_custom_job_class: Mock, + ) -> None: + """User-supplied tensorboard_experiment_name must match Vertex's regex.""" + mock_job = Mock() + mock_custom_job_class.return_value = mock_job + + service = VertexAIService( + project="test-project", + location="us-central1", + service_account="svc@test-project.iam.gserviceaccount.com", + staging_bucket="gs://test-staging-bucket", + ) + + job_config = VertexAiJobConfig( + job_name="any-job", + container_uri="gcr.io/test/image:latest", + command=["python", "-m", "trainer"], + base_output_dir="gs://test-perm-bucket/run/trainer", + tensorboard_resource_name="projects/test/locations/us-central1/tensorboards/123", + tensorboard_experiment_name="Invalid_Experiment_Name", + ) + + with self.assertRaises(ValueError) as ctx: + service.launch_job(job_config=job_config) + + self.assertIn("tensorboard_experiment_name", str(ctx.exception)) + + if __name__ == "__main__": absltest.main() From be5bbf03d1f242c70bdbf2957b26829d12710959 Mon Sep 17 00:00:00 2001 From: 
kmontemayor Date: Mon, 4 May 2026 21:18:04 +0000 Subject: [PATCH 33/59] vertex_ai: drop experiment_run kwarg, let SDK auto-generate the run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Passing experiment_run= invokes aiplatform.ExperimentRun's getter, which 404s when the run doesn't exist yet (every first submit does). Per the SDK's own contract — "If 'experiment' is set but 'experiment_run' is not, an ExperimentRun resource will still be auto-generated" (jobs.py:2509-2514) — the right move is to set only experiment= and let the SDK create the run. The user-supplied experiment name is still validated up front against Vertex's resource-ID regex; the run-id sanitizer is no longer needed. --- gigl/common/services/vertex_ai.py | 38 ++--------- tests/unit/src/common/vertex_ai_test.py | 87 ++----------------------- 2 files changed, 11 insertions(+), 114 deletions(-) diff --git a/gigl/common/services/vertex_ai.py b/gigl/common/services/vertex_ai.py index a44e3092d..4ad0bface 100644 --- a/gigl/common/services/vertex_ai.py +++ b/gigl/common/services/vertex_ai.py @@ -88,40 +88,13 @@ def get_pipeline() -> int: # NOTE: `get_pipeline` here is the Pipeline name DEFAULT_PIPELINE_TIMEOUT_S: Final[int] = 60 * 60 * 36 # 36 hours DEFAULT_CUSTOM_JOB_TIMEOUT_S: Final[int] = 60 * 60 * 24 # 24 hours -# Vertex AI Experiment / ExperimentRun resource IDs are MetadataStore Context -# IDs and must satisfy this regex (the SDK builds the run's resource ID as -# ``-``, so each part must individually conform). +# Vertex AI Experiment IDs are MetadataStore Context IDs and must satisfy +# this regex. _VERTEX_RESOURCE_ID_PATTERN: Final[re.Pattern[str]] = re.compile( r"^[a-z0-9][a-z0-9-]{0,127}$" ) -def _sanitize_vertex_run_id(value: str) -> str: - """Coerce a GiGL job name to a valid Vertex AI ExperimentRun ID. - - Lowercases the string and replaces ``_`` with ``-``. Validates the result. 
- - Args: - value: A job name (typically GiGL ``gigl_train_...`` style). - - Returns: - A string matching ``[a-z0-9][a-z0-9-]{0,127}``. - - Raises: - ValueError: If sanitization can't produce a valid Vertex AI resource ID - (e.g. the input contains characters other than ``[A-Za-z0-9_-]`` or - is too long). - """ - sanitized = value.lower().replace("_", "-") - if not _VERTEX_RESOURCE_ID_PATTERN.match(sanitized): - raise ValueError( - f"Cannot derive a valid Vertex AI ExperimentRun ID from {value!r}; " - f"after lowercasing and replacing underscores got {sanitized!r}, " - f"which does not match {_VERTEX_RESOURCE_ID_PATTERN.pattern}." - ) - return sanitized - - @dataclass class VertexAiJobConfig: """Configuration for a Vertex AI CustomJob worker pool. @@ -461,10 +434,11 @@ def _submit_job( experiment_name=job_config.tensorboard_experiment_name, tensorboard_resource_name=job_config.tensorboard_resource_name, ) + # Don't pass experiment_run: when experiment is set but + # experiment_run is not, the SDK auto-generates an ExperimentRun + # for this job. Passing a name here invokes the run *getter*, + # which 404s for a not-yet-created run. 
submit_kwargs["experiment"] = job_config.tensorboard_experiment_name - submit_kwargs["experiment_run"] = _sanitize_vertex_run_id( - job_config.job_name - ) else: submit_kwargs["tensorboard"] = job_config.tensorboard_resource_name job.submit(**submit_kwargs) diff --git a/tests/unit/src/common/vertex_ai_test.py b/tests/unit/src/common/vertex_ai_test.py index 1f36c7463..ef1b3411a 100644 --- a/tests/unit/src/common/vertex_ai_test.py +++ b/tests/unit/src/common/vertex_ai_test.py @@ -4,11 +4,7 @@ from absl.testing import absltest -from gigl.common.services.vertex_ai import ( - VertexAiJobConfig, - VertexAIService, - _sanitize_vertex_run_id, -) +from gigl.common.services.vertex_ai import VertexAiJobConfig, VertexAIService from tests.test_assets.test_case import TestCase @@ -118,7 +114,8 @@ def test_submit_job_uses_experiment_when_set( mock_job.submit.assert_called_once() submit_kwargs = mock_job.submit.call_args.kwargs self.assertEqual(submit_kwargs["experiment"], "my-comparison") - self.assertEqual(submit_kwargs["experiment_run"], job_config.job_name) + # experiment_run intentionally NOT set: Vertex auto-generates one. 
+ self.assertNotIn("experiment_run", submit_kwargs) self.assertNotIn("tensorboard", submit_kwargs) @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") @@ -263,82 +260,8 @@ def test_experiment_exists_matching_backing_tb_is_noop( mock_existing_experiment.assign_backing_tensorboard.assert_not_called() -class TestSanitizeVertexRunId(TestCase): - """Tests for the _sanitize_vertex_run_id helper.""" - - def test_lowercases_and_replaces_underscores(self) -> None: - self.assertEqual( - _sanitize_vertex_run_id("gigl_train_hom_cora_sup_test_on_20260504_192055"), - "gigl-train-hom-cora-sup-test-on-20260504-192055", - ) - - def test_already_valid_passthrough(self) -> None: - self.assertEqual( - _sanitize_vertex_run_id("already-valid-id-123"), - "already-valid-id-123", - ) - - def test_uppercase_lowered(self) -> None: - self.assertEqual(_sanitize_vertex_run_id("MyJob"), "myjob") - - def test_invalid_chars_after_sanitization_raises(self) -> None: - with self.assertRaises(ValueError) as ctx: - _sanitize_vertex_run_id("has spaces and !@#") - self.assertIn("Vertex AI ExperimentRun ID", str(ctx.exception)) - - def test_leading_hyphen_rejected(self) -> None: - # Underscores at the start become hyphens, which violates the regex - # (the first character must be alphanumeric). 
- with self.assertRaises(ValueError): - _sanitize_vertex_run_id("_leading_underscore") - - -class TestSubmitJobSanitizesRunId(TestCase): - """Tests that _submit_job applies sanitization and validation.""" - - @patch("gigl.common.services.vertex_ai.aiplatform.Experiment") - @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") - @patch("gigl.common.services.vertex_ai.aiplatform.init") - def test_underscored_job_name_is_sanitized_for_experiment_run( - self, - mock_aiplatform_init: Mock, - mock_custom_job_class: Mock, - mock_experiment_cls: Mock, - ) -> None: - """The GiGL ``gigl_train_...`` job name must be coerced to a valid Vertex AI run ID.""" - mock_exp = Mock() - mock_exp.get_backing_tensorboard_resource.return_value = Mock( - resource_name="projects/test/locations/us-central1/tensorboards/123" - ) - mock_experiment_cls.get.return_value = mock_exp - - mock_job = Mock() - mock_job.resource_name = "projects/test/locations/us-central1/customJobs/456" - mock_job.name = "456" - mock_custom_job_class.return_value = mock_job - - service = VertexAIService( - project="test-project", - location="us-central1", - service_account="svc@test-project.iam.gserviceaccount.com", - staging_bucket="gs://test-staging-bucket", - ) - - job_config = VertexAiJobConfig( - job_name="gigl_train_my_run_20260504", - container_uri="gcr.io/test/image:latest", - command=["python", "-m", "trainer"], - base_output_dir="gs://test-perm-bucket/run/trainer", - tensorboard_resource_name="projects/test/locations/us-central1/tensorboards/123", - tensorboard_experiment_name="my-comparison", - ) - - service.launch_job(job_config=job_config) - - submit_kwargs = mock_job.submit.call_args.kwargs - self.assertEqual( - submit_kwargs["experiment_run"], "gigl-train-my-run-20260504" - ) +class TestSubmitJobValidatesExperimentName(TestCase): + """Tests that _submit_job validates the user-supplied experiment name.""" @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") 
@patch("gigl.common.services.vertex_ai.aiplatform.init") From e19f1050bed531ad94ac34b85e43e5b898bd1711 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Mon, 4 May 2026 22:51:49 +0000 Subject: [PATCH 34/59] tensorboard: stream events from chief rank, drop submit(experiment=) path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vertex's auto-uploader is gated on submit(tensorboard=); submit(experiment=) alone leaves AIP_TENSORBOARD_LOG_DIR populated but never reads from it, so events sit in GCS un-uploaded (job 6570151780682825728 is the smoking gun). The two submit kwargs are mutually exclusive, and tensorboard= forces a job-scoped experiment we can't rename — neither delivers multi-run comparison. Switch approach: when tensorboard_experiment_name is set, the launcher injects GIGL_TENSORBOARD_RESOURCE_NAME and GIGL_TENSORBOARD_EXPERIMENT_NAME into the worker container, and submits with NEITHER experiment= nor tensorboard=. The trainer's TensorBoardWriter.from_env reads those env vars on the chief rank and starts aiplatform.start_upload_tb_log to stream AIP_TENSORBOARD_LOG_DIR to the named TensorboardExperiment. close() pairs with end_upload_tb_log. Multiple jobs sharing the experiment name show as comparable runs on one TB page. Drops the now-unused _ensure_experiment_with_backing_tb helper (the SDK uploader auto-creates the TensorboardExperiment). 
--- gigl/common/services/vertex_ai.py | 59 ++------ gigl/src/common/vertex_ai_launcher.py | 26 +++- gigl/utils/tensorboard_writer.py | 112 +++++++++++++-- .../src/common/vertex_ai_launcher_test.py | 61 +++++++++ tests/unit/src/common/vertex_ai_test.py | 127 +----------------- tests/unit/utils/tensorboard_writer_test.py | 96 +++++++++++++ 6 files changed, 298 insertions(+), 183 deletions(-) diff --git a/gigl/common/services/vertex_ai.py b/gigl/common/services/vertex_ai.py index 4ad0bface..3e1f764cd 100644 --- a/gigl/common/services/vertex_ai.py +++ b/gigl/common/services/vertex_ai.py @@ -354,47 +354,6 @@ def launch_graph_store_job( return self._submit_job(worker_pool_specs, compute_pool_job_config) - def _ensure_experiment_with_backing_tb( - self, - experiment_name: str, - tensorboard_resource_name: str, - ) -> None: - """Ensure ``experiment_name`` exists with ``tensorboard_resource_name`` as its backing TB. - - Idempotent. Creates the Vertex AI Experiment if missing and assigns the - backing TB. If the experiment already exists with a different backing - TB, raises ``ValueError`` (silently uploading to the wrong TB would be - surprising and hard to debug). - - Args: - experiment_name: The name of the Vertex AI Experiment. - tensorboard_resource_name: The fully-qualified resource name of the - Vertex AI Tensorboard to use as the backing TB. - - Raises: - ValueError: If the experiment already exists with a different - backing tensorboard resource name. 
- """ - experiment = aiplatform.Experiment.get(experiment_name) - if experiment is None: - experiment = aiplatform.Experiment.create(experiment_name) - experiment.assign_backing_tensorboard(tensorboard_resource_name) - return - - backing = experiment.get_backing_tensorboard_resource() - if backing is None: - experiment.assign_backing_tensorboard(tensorboard_resource_name) - return - - if backing.resource_name != tensorboard_resource_name: - raise ValueError( - f"Vertex AI Experiment {experiment_name!r} already has a " - f"backing tensorboard {backing.resource_name!r} that does not " - f"match the configured {tensorboard_resource_name!r}. Either " - "use a fresh experiment name or update the resource config to " - "the existing backing TB." - ) - def _submit_job( self, worker_pool_specs: Union[list[WorkerPoolSpec], list[dict]], @@ -430,15 +389,15 @@ def _submit_job( f"is not a valid Vertex AI Experiment ID; it must match " f"{_VERTEX_RESOURCE_ID_PATTERN.pattern}." ) - self._ensure_experiment_with_backing_tb( - experiment_name=job_config.tensorboard_experiment_name, - tensorboard_resource_name=job_config.tensorboard_resource_name, - ) - # Don't pass experiment_run: when experiment is set but - # experiment_run is not, the SDK auto-generates an ExperimentRun - # for this job. Passing a name here invokes the run *getter*, - # which 404s for a not-yet-created run. - submit_kwargs["experiment"] = job_config.tensorboard_experiment_name + # Don't set ``experiment=`` or ``tensorboard=`` on submit. The + # SDK forbids both together, ``experiment=`` alone does NOT + # trigger TB streaming (Vertex's auto-uploader is gated on + # ``tensorboard=``), and ``tensorboard=`` alone uploads to a + # job-scoped experiment we can't rename. 
Instead, the launcher + # has injected ``GIGL_TENSORBOARD_*`` env vars into the worker + # container, and the trainer's ``TensorBoardWriter.from_env`` + # runs ``aiplatform.start_upload_tb_log`` on the chief rank to + # stream events to the user-chosen experiment. else: submit_kwargs["tensorboard"] = job_config.tensorboard_resource_name job.submit(**submit_kwargs) diff --git a/gigl/src/common/vertex_ai_launcher.py b/gigl/src/common/vertex_ai_launcher.py index cb83d1663..99139b154 100644 --- a/gigl/src/common/vertex_ai_launcher.py +++ b/gigl/src/common/vertex_ai_launcher.py @@ -297,12 +297,36 @@ def _build_job_config( else None ) + # When the user opted into a stable Vertex AI TensorboardExperiment, + # ``VertexAIService._submit_job`` does NOT pass ``tensorboard=`` on submit + # (Vertex's auto-uploader would route to a job-scoped experiment we can't + # rename). Instead, the chief-rank trainer streams events itself via + # ``aiplatform.start_upload_tb_log``. Inject the resource name and + # experiment name as container env vars so the trainer can find them. 
+ container_env_vars = list(env_vars) + if ( + tensorboard_experiment_name + and vertex_ai_resource_config.tensorboard_resource_name + ): + container_env_vars.extend( + [ + env_var.EnvVar( + name="GIGL_TENSORBOARD_RESOURCE_NAME", + value=vertex_ai_resource_config.tensorboard_resource_name, + ), + env_var.EnvVar( + name="GIGL_TENSORBOARD_EXPERIMENT_NAME", + value=tensorboard_experiment_name, + ), + ] + ) + job_config = VertexAiJobConfig( job_name=job_name, container_uri=container_uri, command=command, args=job_args, - environment_variables=env_vars, + environment_variables=container_env_vars, machine_type=vertex_ai_resource_config.machine_type, accelerator_type=vertex_ai_resource_config.gpu_type.upper().replace("-", "_"), accelerator_count=vertex_ai_resource_config.gpu_limit, diff --git a/gigl/utils/tensorboard_writer.py b/gigl/utils/tensorboard_writer.py index 3e0e674eb..bf09556a5 100644 --- a/gigl/utils/tensorboard_writer.py +++ b/gigl/utils/tensorboard_writer.py @@ -1,7 +1,8 @@ """TensorBoard writer for GiGL training entrypoints.""" import os -from typing import Any, Optional +import re +from typing import Any, Final, Optional import tensorflow as tf @@ -15,7 +16,25 @@ # References: # https://cloud.google.com/vertex-ai/docs/training/code-requirements # https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#FIELDS.base_output_directory -_VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY = "AIP_TENSORBOARD_LOG_DIR" +_VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY: Final[str] = "AIP_TENSORBOARD_LOG_DIR" + +# Set by GiGL's launcher (``gigl/src/common/vertex_ai_launcher.py``) when the +# user requested a stable Vertex AI ``TensorboardExperiment`` for cross-job +# comparison. When both env vars are set on the chief rank, the writer also +# starts a background uploader (``aiplatform.start_upload_tb_log``) that +# streams events from the log dir to that experiment's backing TB. Without +# these, the writer just writes files to ``AIP_TENSORBOARD_LOG_DIR`` and no +# upload happens. 
+_GIGL_TENSORBOARD_RESOURCE_NAME_ENV_KEY: Final[str] = "GIGL_TENSORBOARD_RESOURCE_NAME" +_GIGL_TENSORBOARD_EXPERIMENT_NAME_ENV_KEY: Final[str] = ( + "GIGL_TENSORBOARD_EXPERIMENT_NAME" +) + +_TENSORBOARD_RESOURCE_NAME_PATTERN: Final[re.Pattern[str]] = re.compile( + r"^projects/(?P[^/]+)" + r"/locations/(?P[^/]+)" + r"/tensorboards/(?P[^/]+)$" +) class TensorBoardWriter: @@ -32,17 +51,26 @@ class TensorBoardWriter: ... tb.log({"Loss/train": loss, "Loss/val": vloss}, step=batch_idx) """ - def __init__(self, log_dir: Optional[str]) -> None: + def __init__( + self, + log_dir: Optional[str], + *, + upload_started: bool = False, + ) -> None: """Initialize the writer. Args: log_dir: Destination directory for TensorBoard events. When ``None``, the writer is a no-op and allocates no TF resources. + upload_started: Whether ``aiplatform.start_upload_tb_log`` has + been called and needs a paired ``end_upload_tb_log`` on + ``close()``. """ self._writer: Optional[Any] = ( tf.summary.create_file_writer(log_dir) if log_dir else None ) self._closed = False + self._upload_started = upload_started @classmethod def from_env(cls, *, enabled: bool = True) -> "TensorBoardWriter": @@ -52,10 +80,18 @@ def from_env(cls, *, enabled: bool = True) -> "TensorBoardWriter": the environment. This is the path non-chief ranks take so they can share the same call sites as the chief. - When ``enabled`` is ``True``, the env var must be set; otherwise this - raises ``RuntimeError`` rather than silently no-op'ing. The env var is - populated by Vertex AI from ``CustomJobSpec.baseOutputDirectory`` (see - the references in this module's header). + When ``enabled`` is ``True``: + + - ``AIP_TENSORBOARD_LOG_DIR`` must be set; otherwise this raises + ``RuntimeError`` rather than silently no-op'ing. The env var is + populated by Vertex AI from ``CustomJobSpec.baseOutputDirectory`` + (see the references in this module's header). 
+ - If ``GIGL_TENSORBOARD_RESOURCE_NAME`` and + ``GIGL_TENSORBOARD_EXPERIMENT_NAME`` are also set, this also starts + a background ``aiplatform`` uploader that streams events from the + log dir to the named ``TensorboardExperiment`` under the configured + ``Tensorboard`` instance. The uploader is shut down on + :meth:`close`. Args: enabled: Whether this caller is responsible for writing events. @@ -67,6 +103,8 @@ def from_env(cls, *, enabled: bool = True) -> "TensorBoardWriter": Raises: RuntimeError: If ``enabled`` is True and ``AIP_TENSORBOARD_LOG_DIR`` is not set in the environment. + ValueError: If ``GIGL_TENSORBOARD_RESOURCE_NAME`` is set but does + not match ``projects/.../locations/.../tensorboards/...``. """ if not enabled: return cls(log_dir=None) @@ -78,7 +116,9 @@ def from_env(cls, *, enabled: bool = True) -> "TensorBoardWriter": "a Vertex AI CustomJob with baseOutputDirectory configured. " "See https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#FIELDS.base_output_directory." ) - return cls(log_dir=log_dir) + + upload_started = _maybe_start_uploader(log_dir=log_dir) + return cls(log_dir=log_dir, upload_started=upload_started) def log(self, metrics: dict[str, float], step: int) -> None: """Write each metric scalar at ``step`` and flush. @@ -98,16 +138,66 @@ def log(self, metrics: dict[str, float], step: int) -> None: self._writer.flush() def close(self) -> None: - """Close the underlying TF writer. + """Close the underlying TF writer and stop the uploader if running. Idempotent; safe to call multiple times and on no-op writers. """ - if self._writer is not None and not self._closed: + if self._closed: + return + if self._writer is not None: self._writer.close() - self._closed = True + if self._upload_started: + # Local import keeps the optional aiplatform dependency out of + # the no-op path. 
+ from google.cloud import aiplatform + + aiplatform.end_upload_tb_log() + self._closed = True def __enter__(self) -> "TensorBoardWriter": return self def __exit__(self, *_exc: object) -> None: self.close() + + +def _maybe_start_uploader(*, log_dir: str) -> bool: + """Start the aiplatform TB uploader iff the GiGL env vars are present. + + Returns ``True`` if the uploader was started (caller must arrange for + ``aiplatform.end_upload_tb_log`` on shutdown), ``False`` otherwise. + + Args: + log_dir: Directory the uploader watches for new event files. + + Raises: + ValueError: If ``GIGL_TENSORBOARD_RESOURCE_NAME`` is set but does not + match the expected resource-name format. + """ + tb_resource_name = os.environ.get(_GIGL_TENSORBOARD_RESOURCE_NAME_ENV_KEY) + experiment_name = os.environ.get(_GIGL_TENSORBOARD_EXPERIMENT_NAME_ENV_KEY) + if not tb_resource_name or not experiment_name: + return False + + match = _TENSORBOARD_RESOURCE_NAME_PATTERN.match(tb_resource_name) + if not match: + raise ValueError( + f"{_GIGL_TENSORBOARD_RESOURCE_NAME_ENV_KEY}={tb_resource_name!r} " + "does not match projects/.../locations/.../tensorboards/...; " + "the GiGL launcher should set this to the same resource name " + "configured on GiglResourceConfig." + ) + + # Local import: aiplatform is only needed when the user opts in. 
+ from google.cloud import aiplatform + + aiplatform.init( + project=match["project"], + location=match["location"], + ) + aiplatform.start_upload_tb_log( + tensorboard_id=match["tensorboard_id"], + tensorboard_experiment_name=experiment_name, + logdir=log_dir, + ) + return True diff --git a/tests/unit/src/common/vertex_ai_launcher_test.py b/tests/unit/src/common/vertex_ai_launcher_test.py index dd54e1582..0579f0389 100644 --- a/tests/unit/src/common/vertex_ai_launcher_test.py +++ b/tests/unit/src/common/vertex_ai_launcher_test.py @@ -488,6 +488,67 @@ def test_build_job_config_experiment_name_default(self) -> None: ) self.assertIsNone(cfg.tensorboard_experiment_name) + def test_build_job_config_injects_gigl_tensorboard_env_vars(self) -> None: + """When tensorboard_experiment_name is set with a TB resource, the + launcher injects env vars so the trainer's chief-rank uploader can + find the destination experiment. + """ + resource_config = gigl_resource_config_pb2.VertexAiResourceConfig( + machine_type="n1-standard-4", + gpu_type="ACCELERATOR_TYPE_UNSPECIFIED", + gpu_limit=0, + num_replicas=1, + tensorboard_resource_name="projects/p/locations/us/tensorboards/1", + ) + cfg = _build_job_config( + job_name="job", + task_config_uri=Uri("gs://b/task.yaml"), + resource_config_uri=Uri("gs://b/resource.yaml"), + command_str="python -m gigl.src.training.v2.glt_trainer", + args={}, + use_cuda=False, + container_uri="gcr.io/p/img", + vertex_ai_resource_config=resource_config, + env_vars=[], + tensorboard_logs_uri=Uri("gs://b/run/logs/"), + tensorboard_experiment_name="my-comparison", + ) + env = {ev.name: ev.value for ev in cfg.environment_variables or []} + self.assertEqual( + env["GIGL_TENSORBOARD_RESOURCE_NAME"], + "projects/p/locations/us/tensorboards/1", + ) + self.assertEqual(env["GIGL_TENSORBOARD_EXPERIMENT_NAME"], "my-comparison") + + def test_build_job_config_no_gigl_env_vars_when_experiment_name_unset( + self, + ) -> None: + """The GIGL_TENSORBOARD_* env vars are NOT 
injected on the legacy + ``submit(tensorboard=...)`` path. + """ + resource_config = gigl_resource_config_pb2.VertexAiResourceConfig( + machine_type="n1-standard-4", + gpu_type="ACCELERATOR_TYPE_UNSPECIFIED", + gpu_limit=0, + num_replicas=1, + tensorboard_resource_name="projects/p/locations/us/tensorboards/1", + ) + cfg = _build_job_config( + job_name="job", + task_config_uri=Uri("gs://b/task.yaml"), + resource_config_uri=Uri("gs://b/resource.yaml"), + command_str="python -m gigl.src.training.v2.glt_trainer", + args={}, + use_cuda=False, + container_uri="gcr.io/p/img", + vertex_ai_resource_config=resource_config, + env_vars=[], + tensorboard_logs_uri=Uri("gs://b/run/logs/"), + ) + env_names = {ev.name for ev in cfg.environment_variables or []} + self.assertNotIn("GIGL_TENSORBOARD_RESOURCE_NAME", env_names) + self.assertNotIn("GIGL_TENSORBOARD_EXPERIMENT_NAME", env_names) + if __name__ == "__main__": absltest.main() diff --git a/tests/unit/src/common/vertex_ai_test.py b/tests/unit/src/common/vertex_ai_test.py index ef1b3411a..50075e679 100644 --- a/tests/unit/src/common/vertex_ai_test.py +++ b/tests/unit/src/common/vertex_ai_test.py @@ -72,22 +72,17 @@ def test_vertex_ai_job_config_carries_experiment_name(self) -> None: ) self.assertEqual(cfg.tensorboard_experiment_name, "my-comparison") - @patch("gigl.common.services.vertex_ai.aiplatform.Experiment") @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") @patch("gigl.common.services.vertex_ai.aiplatform.init") - def test_submit_job_uses_experiment_when_set( + def test_submit_job_skips_experiment_and_tensorboard_when_experiment_name_set( self, mock_aiplatform_init: Mock, mock_custom_job_class: Mock, - mock_experiment_cls: Mock, ) -> None: - """When tensorboard_experiment_name is set, submit uses experiment= and experiment_run= instead of tensorboard=.""" - mock_exp = Mock() - mock_exp.get_backing_tensorboard_resource.return_value = Mock( - resource_name="projects/test/locations/us-central1/tensorboards/123" - 
) - mock_experiment_cls.get.return_value = mock_exp - + """When tensorboard_experiment_name is set, submit passes neither + ``experiment`` nor ``tensorboard`` — the trainer streams events itself + via the chief-rank uploader. + """ mock_job = Mock() mock_job.resource_name = "projects/test/locations/us-central1/customJobs/456" mock_job.name = "456" @@ -113,8 +108,7 @@ def test_submit_job_uses_experiment_when_set( mock_job.submit.assert_called_once() submit_kwargs = mock_job.submit.call_args.kwargs - self.assertEqual(submit_kwargs["experiment"], "my-comparison") - # experiment_run intentionally NOT set: Vertex auto-generates one. + self.assertNotIn("experiment", submit_kwargs) self.assertNotIn("experiment_run", submit_kwargs) self.assertNotIn("tensorboard", submit_kwargs) @@ -151,115 +145,6 @@ def test_submit_job_raises_when_experiment_name_set_but_no_tb_resource( self.assertIn("tensorboard_resource_name", str(ctx.exception)) -class TestEnsureExperimentWithBackingTb(TestCase): - """Tests for VertexAIService._ensure_experiment_with_backing_tb.""" - - _TB_RESOURCE_NAME = "projects/p/locations/us-central1/tensorboards/42" - _EXPERIMENT_NAME = "my-experiment" - - def _make_service(self, mock_init: Mock) -> VertexAIService: - return VertexAIService( - project="test-project", - location="us-central1", - service_account="svc@test.iam.gserviceaccount.com", - staging_bucket="gs://test-bucket", - ) - - @patch("gigl.common.services.vertex_ai.aiplatform.Experiment") - @patch("gigl.common.services.vertex_ai.aiplatform.init") - def test_experiment_does_not_exist_creates_and_assigns( - self, - mock_init: Mock, - mock_experiment_class: Mock, - ) -> None: - """When the experiment doesn't exist, creates it and assigns backing TB.""" - mock_experiment_class.get.return_value = None - mock_new_experiment = Mock() - mock_experiment_class.create.return_value = mock_new_experiment - - service = self._make_service(mock_init) - service._ensure_experiment_with_backing_tb( - 
self._EXPERIMENT_NAME, self._TB_RESOURCE_NAME - ) - - mock_experiment_class.get.assert_called_once_with(self._EXPERIMENT_NAME) - mock_experiment_class.create.assert_called_once_with(self._EXPERIMENT_NAME) - mock_new_experiment.assign_backing_tensorboard.assert_called_once_with( - self._TB_RESOURCE_NAME - ) - - @patch("gigl.common.services.vertex_ai.aiplatform.Experiment") - @patch("gigl.common.services.vertex_ai.aiplatform.init") - def test_experiment_exists_no_backing_tb_assigns( - self, - mock_init: Mock, - mock_experiment_class: Mock, - ) -> None: - """When the experiment exists with no backing TB, assigns the backing TB.""" - mock_existing_experiment = Mock() - mock_existing_experiment.get_backing_tensorboard_resource.return_value = None - mock_experiment_class.get.return_value = mock_existing_experiment - - service = self._make_service(mock_init) - service._ensure_experiment_with_backing_tb( - self._EXPERIMENT_NAME, self._TB_RESOURCE_NAME - ) - - mock_experiment_class.create.assert_not_called() - mock_existing_experiment.assign_backing_tensorboard.assert_called_once_with( - self._TB_RESOURCE_NAME - ) - - @patch("gigl.common.services.vertex_ai.aiplatform.Experiment") - @patch("gigl.common.services.vertex_ai.aiplatform.init") - def test_experiment_exists_different_backing_tb_raises( - self, - mock_init: Mock, - mock_experiment_class: Mock, - ) -> None: - """When the experiment exists with a different backing TB, raises ValueError.""" - mock_backing = Mock() - mock_backing.resource_name = "projects/p/locations/us-central1/tensorboards/99" - mock_existing_experiment = Mock() - mock_existing_experiment.get_backing_tensorboard_resource.return_value = ( - mock_backing - ) - mock_experiment_class.get.return_value = mock_existing_experiment - - service = self._make_service(mock_init) - with self.assertRaises(ValueError) as ctx: - service._ensure_experiment_with_backing_tb( - self._EXPERIMENT_NAME, self._TB_RESOURCE_NAME - ) - - self.assertIn("backing tensorboard", 
str(ctx.exception).lower()) - - @patch("gigl.common.services.vertex_ai.aiplatform.Experiment") - @patch("gigl.common.services.vertex_ai.aiplatform.init") - def test_experiment_exists_matching_backing_tb_is_noop( - self, - mock_init: Mock, - mock_experiment_class: Mock, - ) -> None: - """When the experiment exists with the correct backing TB, does nothing.""" - mock_backing = Mock() - mock_backing.resource_name = self._TB_RESOURCE_NAME - mock_existing_experiment = Mock() - mock_existing_experiment.get_backing_tensorboard_resource.return_value = ( - mock_backing - ) - mock_experiment_class.get.return_value = mock_existing_experiment - - service = self._make_service(mock_init) - # Should not raise and should not call assign or create - service._ensure_experiment_with_backing_tb( - self._EXPERIMENT_NAME, self._TB_RESOURCE_NAME - ) - - mock_experiment_class.create.assert_not_called() - mock_existing_experiment.assign_backing_tensorboard.assert_not_called() - - class TestSubmitJobValidatesExperimentName(TestCase): """Tests that _submit_job validates the user-supplied experiment name.""" diff --git a/tests/unit/utils/tensorboard_writer_test.py b/tests/unit/utils/tensorboard_writer_test.py index 9d11eca6d..3086483fc 100644 --- a/tests/unit/utils/tensorboard_writer_test.py +++ b/tests/unit/utils/tensorboard_writer_test.py @@ -111,5 +111,101 @@ def test_close_on_noop_writer_does_not_raise(self) -> None: writer.close() # Idempotent on no-op writer. 
+class TestTensorBoardWriterUploader(TestCase): + """Tests for the chief-rank ``aiplatform.start_upload_tb_log`` hook.""" + + _LOG_DIR = "gs://vertex-managed/logs" + _TB_RESOURCE = "projects/my-project/locations/us-central1/tensorboards/42" + _EXPERIMENT = "my-comparison" + + def test_uploader_starts_when_both_env_vars_present(self) -> None: + with patch.dict( + os.environ, + { + "AIP_TENSORBOARD_LOG_DIR": self._LOG_DIR, + "GIGL_TENSORBOARD_RESOURCE_NAME": self._TB_RESOURCE, + "GIGL_TENSORBOARD_EXPERIMENT_NAME": self._EXPERIMENT, + }, + clear=True, + ): + with patch( + "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" + ): + with patch( + "google.cloud.aiplatform.start_upload_tb_log" + ) as mock_start, patch( + "google.cloud.aiplatform.init" + ) as mock_init, patch( + "google.cloud.aiplatform.end_upload_tb_log" + ) as mock_end: + writer = TensorBoardWriter.from_env() + writer.close() + + mock_init.assert_called_once_with( + project="my-project", location="us-central1" + ) + mock_start.assert_called_once_with( + tensorboard_id="42", + tensorboard_experiment_name=self._EXPERIMENT, + logdir=self._LOG_DIR, + ) + mock_end.assert_called_once() + + def test_uploader_does_not_start_when_only_log_dir_set(self) -> None: + with patch.dict( + os.environ, + {"AIP_TENSORBOARD_LOG_DIR": self._LOG_DIR}, + clear=True, + ): + with patch( + "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" + ): + with patch( + "google.cloud.aiplatform.start_upload_tb_log" + ) as mock_start, patch( + "google.cloud.aiplatform.end_upload_tb_log" + ) as mock_end: + writer = TensorBoardWriter.from_env() + writer.close() + + mock_start.assert_not_called() + mock_end.assert_not_called() + + def test_invalid_tb_resource_name_raises(self) -> None: + with patch.dict( + os.environ, + { + "AIP_TENSORBOARD_LOG_DIR": self._LOG_DIR, + "GIGL_TENSORBOARD_RESOURCE_NAME": "not-a-valid-resource-name", + "GIGL_TENSORBOARD_EXPERIMENT_NAME": self._EXPERIMENT, + }, + clear=True, + ): + with patch( + 
"gigl.utils.tensorboard_writer.tf.summary.create_file_writer" + ): + with self.assertRaises(ValueError) as ctx: + TensorBoardWriter.from_env() + + self.assertIn("GIGL_TENSORBOARD_RESOURCE_NAME", str(ctx.exception)) + + def test_uploader_skipped_for_disabled_writer(self) -> None: + """Non-chief ranks (enabled=False) skip both the writer and uploader.""" + with patch.dict( + os.environ, + { + "AIP_TENSORBOARD_LOG_DIR": self._LOG_DIR, + "GIGL_TENSORBOARD_RESOURCE_NAME": self._TB_RESOURCE, + "GIGL_TENSORBOARD_EXPERIMENT_NAME": self._EXPERIMENT, + }, + clear=True, + ): + with patch("google.cloud.aiplatform.start_upload_tb_log") as mock_start: + writer = TensorBoardWriter.from_env(enabled=False) + writer.close() + + mock_start.assert_not_called() + + if __name__ == "__main__": absltest.main() From fea1a9d53e97c40508745033648323ab23d08bec Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 16:41:31 +0000 Subject: [PATCH 35/59] vertex_ai: always pass tensorboard= so VAI job page links to TB The "Open TensorBoard" link on the Vertex AI job page is gated on jobSpec.tensorboard. Dropping that field (which the previous experiment-name branch did) hides the link entirely. Make _submit_job set tensorboard= any time the resource name is configured. The chief-rank uploader continues to stream to the user-named TensorboardExperiment in parallel for cross-job comparison. Refresh the now-stale dual-uploader story in the dataclass docstring, launcher comment block, and proto comment. Also adds the round-2 plan to docs/plans/. 
--- Makefile | 2 +- docs/plans/20260505-tb-multi-job-iteration.md | 237 ++++++++++++++++++ .../configs/e2e_hom_cora_sup_task_config.yaml | 2 +- gigl/common/services/vertex_ai.py | 32 +-- gigl/src/common/vertex_ai_launcher.py | 13 +- .../snapchat/research/gbml/gbml_config.proto | 17 +- .../gbml/gbml_config/GbmlConfig.scala | 17 +- .../gbml/gbml_config/GbmlConfig.scala | 17 +- snapchat/research/gbml/gbml_config_pb2.pyi | 17 +- tests/unit/src/common/vertex_ai_test.py | 13 +- 10 files changed, 307 insertions(+), 60 deletions(-) create mode 100644 docs/plans/20260505-tb-multi-job-iteration.md diff --git a/Makefile b/Makefile index dc3accd94..e76dbc37c 100644 --- a/Makefile +++ b/Makefile @@ -260,7 +260,7 @@ run_all_e2e_tests: # Example: # `make compiled_pipeline_path="/tmp/gigl/my_pipeline.yaml" compile_gigl_kubeflow_pipeline` # Can be a GCS URI as well -compile_gigl_kubeflow_pipeline: compile_jars push_new_docker_images +compile_gigl_kubeflow_pipeline: push_new_docker_images uv run python -m gigl.orchestration.kubeflow.runner \ --action=compile \ --container_image_cuda=${DOCKER_IMAGE_MAIN_CUDA_NAME_WITH_TAG} \ diff --git a/docs/plans/20260505-tb-multi-job-iteration.md b/docs/plans/20260505-tb-multi-job-iteration.md new file mode 100644 index 000000000..4eec56845 --- /dev/null +++ b/docs/plans/20260505-tb-multi-job-iteration.md @@ -0,0 +1,237 @@ +# Multi-Job TensorBoard: Local Iteration & Final Design Plan + +Date: 2026-05-05 +Branch: `kmonte/add-tb-for-glt` + +This plan supersedes the earlier branch plan at `docs/plans/20260504-tb-experiment-name-proto.md`. It incorporates findings from two Codex plan reviews — round 1 at `.claude/tmp/codex-verify/20260505-155740-plan-crystalline-giggling-backus/review.md` and round 2 at `.claude/tmp/codex-verify/20260505-161326-plan-crystalline-giggling-backus/review.md`. Round-2 deltas (e.g. 
uniqueness via timestamp suffix, returning the `CustomJob` from `launch_single_pool_job`, `--container-uri` required, no commit of experiment name into the e2e CORA config) are applied during implementation, not via plan edits. + +## Context + +Across three full-pipeline iterations on this branch we've cycled through three TB integration designs, each broken in a different way: + +1. **`submit(tensorboard=…)`** — auto-uploader runs, but the destination `TensorboardExperiment` is named after the (numeric) `CustomJob` ID. Per-job page works (R1 ✓), but multiple jobs cannot share one TB page (R2 ✗). +2. **`submit(experiment=…)`** — never streams events. The SDK's `experiment=` is for Vertex AI Experiments parameter/metric tracking; Vertex's TB auto-uploader is gated on `jobSpec.tensorboard` being set, which `experiment=` is mutually exclusive with. Result: events written to `AIP_TENSORBOARD_LOG_DIR` sit in GCS un-uploaded. Job 6570151780682825728 confirmed this empirically. +3. **Custom uploader from chief rank, no `tensorboard=`** — events stream to the chosen experiment (R2 ✓), but the VAI job page no longer shows the "Open TensorBoard" link because that link is keyed on `jobSpec.tensorboard` (R1 ✗). Job 4543918976459079680 confirmed this. + +R1 (TB link from job page) and R2 (multi-job comparison) are not mutually exclusive — they just can't be satisfied by a single mechanism. The right approach combines both: server-side auto-uploader for the job-page link, plus a chief-rank uploader for the cross-job comparison experiment, pointing at two different `TensorboardExperiment`s under the same `Tensorboard` instance. Implementation is small; the risk is verifying behavior end-to-end. The fix for that is a tight local iteration loop. 
+ +## Success criteria + +| ID | Criterion | How verified | +|----|-----------|--------------| +| R1 | The Vertex AI job UI shows "Open TensorBoard" for a successful job, and clicking it loads the per-job experiment with this job's scalar runs. | Manual: open the job in the cloud console; click the link. | +| R2 | Two jobs submitted with the same `tensorboardExperimentName` show **two distinct runs** on one TB page (the user-named experiment), each carrying its own scalars. | Manual: open the named experiment URL; toggle both runs in the scalars dashboard. Smoke script also asserts run count + ≥1 `TensorboardTimeSeries` per run. | +| R3 | Jobs without `tensorboardExperimentName` keep working: events flow to a per-job auto-named experiment. No regression. | Existing `tests/unit/src/common/vertex_ai_test.py::test_submit_job_passes_tensorboard_and_base_output_dir` plus a smoke run with the field unset. | +| R4 | `make unit_test_py` and `make type_check` pass on the branch. | CI / local. | +| R5 (process) | A new dev script lets us submit a tiny CustomJob from a laptop and verify R1+R2 in <2 minutes, end-to-end. | Run it twice; time both invocations. | +| R6 | Trainer process exits cleanly even when training fails — the chief-rank uploader does not hang the worker. | Inspected via the `try/finally` (or `with`) wrapping in all four training entrypoints; `make unit_test_py` covers the writer's idempotent close. | + +## Final design + +**(A) Set `jobSpec.tensorboard=` on every job that has a TB resource configured (even when an experiment name is also set).** This restores the VAI job-page TB link unconditionally and continues to populate `AIP_TENSORBOARD_RESOURCE_NAME` and `AIP_TENSORBOARD_LOG_DIR` in the worker. Vertex's auto-uploader streams events to a per-job experiment named after the job's numeric ID — that's R1. 
+ +**(B) When `tensorboard_experiment_name` is set, the launcher injects three env vars:** + +- `GIGL_TENSORBOARD_RESOURCE_NAME` — full Tensorboard resource name (already injected at HEAD). +- `GIGL_TENSORBOARD_EXPERIMENT_NAME` — the user-chosen experiment name (already injected at HEAD). +- `GIGL_TENSORBOARD_RUN_NAME` — **new**: derived from the launcher's `job_name`, with `_` → `-` (so the GCS subdir name matches what the SDK's `reformat_run_name` will produce). Codex Issue 1 fix. + +**(C) `TensorBoardWriter.from_env()` (chief rank only):** + +- If `GIGL_TENSORBOARD_RUN_NAME` is set: write events to `//` (a *subdirectory*), not to the parent. This makes the run name visible to both the server-side auto-uploader and our chief-rank uploader as a `relpath` of the parent logdir, instead of the SDK's hardcoded `DEFAULT_RUN_NAME = "default"` (`.venv/lib/python3.11/site-packages/google/cloud/aiplatform/tensorboard/uploader_utils.py:44`). Two jobs with different run names → two distinct runs in the named experiment. Codex Issue 1 fix. +- If `GIGL_TENSORBOARD_RUN_NAME` is unset: write to `AIP_TENSORBOARD_LOG_DIR` directly (today's behavior, R3 path). +- If both `GIGL_TENSORBOARD_RESOURCE_NAME` and `GIGL_TENSORBOARD_EXPERIMENT_NAME` are also set, additionally `aiplatform.start_upload_tb_log(tensorboard_id=…, tensorboard_experiment_name=…, logdir=AIP_TENSORBOARD_LOG_DIR)` — the parent logdir, not the subdir, so the uploader's `LogdirLoader` discovers the subdir as a run via `os.path.relpath`. **Do not pass `run_name_prefix`** — the subdir already gives us the run identity, and a non-empty prefix would concatenate awkwardly with the discovered run name. +- `close()` already pairs with `aiplatform.end_upload_tb_log()` (`gigl/utils/tensorboard_writer.py:149`). 
+ +**(D) Always use `with TensorBoardWriter.from_env(...)` in trainer entrypoints.** The SDK uploader thread is **not** a daemon (`.venv/lib/python3.11/site-packages/google/cloud/aiplatform/tensorboard/uploader_tracker.py:162` — `threading.Thread(...).start()` without `daemon=True`); the SDK's docstring explicitly says to call `end_upload_tb_log()` in `finally` (`uploader_tracker.py:109`). Today's example trainers call `close()` only on the happy path. Codex Issue 3 fix: switch all four trainers to context-manager use. + +The `submit(experiment=…)` SDK path and the `_ensure_experiment_with_backing_tb` helper are not needed for either requirement; both are gone as of HEAD `e19f1050`. + +## Files to modify + +- `gigl/common/services/vertex_ai.py` — `_submit_job`: drop the experiment-name early branch; always set `tensorboard=` whenever `job_config.tensorboard_resource_name` is non-empty. Keep the experiment-name regex validation (fail-fast). Update the `VertexAiJobConfig` docstring around `gigl/common/services/vertex_ai.py:150` (Codex Issue 6). +- `gigl/src/common/vertex_ai_launcher.py` — `_build_job_config`: keep the existing `GIGL_TENSORBOARD_RESOURCE_NAME` / `GIGL_TENSORBOARD_EXPERIMENT_NAME` injection; **add** `GIGL_TENSORBOARD_RUN_NAME` (sanitized job name). Update the comment block at `gigl/src/common/vertex_ai_launcher.py:300` describing what `_submit_job` does (Codex Issue 6). +- `gigl/utils/tensorboard_writer.py` — `from_env()` reads `GIGL_TENSORBOARD_RUN_NAME` and uses it as a subdir of `AIP_TENSORBOARD_LOG_DIR` for the `tf.summary.create_file_writer` log_dir; `_maybe_start_uploader` still watches the parent logdir. +- `proto/snapchat/research/gbml/gbml_config.proto:204` — update the `tensorboard_experiment_name` comment to describe the dual-uploader behavior, not the dropped `experiment=`-backed design (Codex Issue 6). Run `make compile_protos` to regenerate Python + Scala stubs. 
+- `examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml:26` — change `tensorboardExperimentName` from the personal `kmonte-test-experiment` to `homogeneous-link-prediction-comparison` (Codex Issue 5). +- `examples/link_prediction/homogeneous_training.py`, `examples/link_prediction/heterogeneous_training.py`, `examples/link_prediction/graph_store/homogeneous_training.py`, `examples/link_prediction/graph_store/heterogeneous_training.py` — replace the existing `tensorboard_writer = TensorBoardWriter.from_env(...)` + later `.close()` pattern with a `with` block. (Codex Issue 3 + Impact Analysis.) +- `tests/unit/src/common/vertex_ai_test.py` — rename `test_submit_job_skips_experiment_and_tensorboard_when_experiment_name_set` to `test_submit_job_passes_tensorboard_with_or_without_experiment_name` and assert `tensorboard=` is set in both branches. +- `tests/unit/src/common/vertex_ai_launcher_test.py` — assert `GIGL_TENSORBOARD_RUN_NAME` is injected when an experiment name is set; not injected otherwise. +- `tests/unit/utils/tensorboard_writer_test.py` — assert the writer's effective `log_dir` is the subdir (`//`) when `GIGL_TENSORBOARD_RUN_NAME` is set; assert `start_upload_tb_log` is called with `logdir=` (NOT the subdir) and no `run_name_prefix`. +- `tools/dev_submit_tb_smoke_job.py` — **new** local iteration tool. The `tools/` directory already exists in the repo (Codex correction). + +## Local iteration tool + +A standalone Python script that bypasses ConfigPopulator and the full pipeline. Goal: <2 min from "I changed code" to "I see whether TB shows up." + +Path: `tools/dev_submit_tb_smoke_job.py`. + +What it does: + +1. **Use the production launcher path** (`gigl.src.common.vertex_ai_launcher.launch_single_pool_job`) — *not* `VertexAIService.launch_job` directly — so the same `_build_job_config` env-var injection runs as in production. Codex Issue 2 fix. +2. 
Constructs a small `VertexAiResourceConfig` proto inline: + - `machine_type="n1-standard-2"`, `gpu_type="ACCELERATOR_TYPE_UNSPECIFIED"`, `gpu_limit=0`, `num_replicas=1`, `tensorboard_resource_name=`. +3. Constructs a small `GiglResourceConfig` proto with that trainer config + `shared_resource_config.common_compute_config` populated from CLI flags. +4. Calls `launch_single_pool_job(...)` with: + - `process_command="python -m gigl.utils.dev.tb_smoke_main"` — a tiny module added in the same commit; reads env vars, instantiates `TensorBoardWriter.from_env(enabled=True)`, writes 3 scalar events at steps 0/1/2, sleeps ~30s, exits. + - `tensorboard_logs_uri = GcsUri("gs:///tb-smoke//logs/")` — drives `base_output_dir` via the existing helper at `gigl/src/common/vertex_ai_launcher.py:_get_base_output_dir_from_tensorboard_logs_uri`. + - `tensorboard_experiment_name` from a CLI flag (or `None`). +5. After completion, queries the Vertex AI APIs: + - `aiplatform.TensorboardExperiment.list(tensorboard_name=)` (`tensorboard_resource.py:518`) to confirm both expected experiments exist (the per-job auto-named one always; the user-named one only when the experiment-name flag was passed). + - For each expected run, `aiplatform.TensorboardTimeSeries.list(tensorboard_run_name=)` (`tensorboard_resource.py:1264`) to confirm at least one scalar tag exists. Codex Issue 4 fix — `TensorboardRun.list` alone only confirms run *existence*, not that scalars were ingested. +6. Prints both TB UI URLs (per-job and named) for manual inspection. + +Required CLI flags: `--project`, `--region`, `--service-account`, `--staging-bucket`, `--tensorboard` (full resource name), and optional `--experiment-name`, `--container-uri` (defaults to `DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU` from `gigl/common/constants.py:69`), `--dry-run`. + +Existing infrastructure to reuse: +- `gigl/src/common/vertex_ai_launcher.py:launch_single_pool_job` — production entry; running through this exercises env-var injection. 
+- `gigl/common/services/vertex_ai.py:VertexAiJobConfig` — config dataclass. +- `gigl/utils/tensorboard_writer.py:TensorBoardWriter` — same writer the trainers use. +- `aiplatform.TensorboardExperiment.list` / `aiplatform.TensorboardRun.list` / `aiplatform.TensorboardTimeSeries.list` — verification surfaces. +- `DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU` from `gigl/common/constants.py:69` — default container image. + +## Step-by-step plan + +Each step ends with a verification. + +### Step 1: revert `_submit_job` to always pass `tensorboard=` and refresh stale comments + +Production code: +- `gigl/common/services/vertex_ai.py:_submit_job` — set `submit_kwargs["tensorboard"] = job_config.tensorboard_resource_name` whenever `job_config.tensorboard_resource_name` is non-empty, regardless of `tensorboard_experiment_name`. Keep the experiment-name regex validation gate. +- `gigl/common/services/vertex_ai.py:150` — update the `VertexAiJobConfig.tensorboard_experiment_name` docstring to describe "auxiliary chief-rank uploader streams events to this experiment in addition to the per-job auto-named one." +- `gigl/src/common/vertex_ai_launcher.py:300` — update the comment block describing `_submit_job` behavior. +- `proto/snapchat/research/gbml/gbml_config.proto:204` — replace the `experiment=`-backed description with the new dual-uploader description; run `make compile_protos`. + +Tests: +- `tests/unit/src/common/vertex_ai_test.py` — rename `test_submit_job_skips_experiment_and_tensorboard_when_experiment_name_set` → `test_submit_job_passes_tensorboard_with_or_without_experiment_name`; assert `tensorboard=` is set in both branches. + +Verify: `make unit_test_py PY_TEST_FILES="vertex_ai_test.py"` passes; `make type_check` is clean. + +Commit: `vertex_ai: always pass tensorboard= so VAI job page links to TB`. 
+ +### Step 2: inject `GIGL_TENSORBOARD_RUN_NAME` and consume it in the writer + +Production code: +- `gigl/src/common/vertex_ai_launcher.py:_build_job_config` — when `tensorboard_experiment_name` is set, also append `env_var.EnvVar(name="GIGL_TENSORBOARD_RUN_NAME", value=job_name.replace("_", "-"))` next to the existing two GIGL_TENSORBOARD_* env vars. (We pre-sanitize so the GCS subdir name and the SDK-derived run name agree.) +- `gigl/utils/tensorboard_writer.py:from_env` — if `GIGL_TENSORBOARD_RUN_NAME` is set, compute `effective_log_dir = os.path.join(AIP_TENSORBOARD_LOG_DIR, run_name)` and pass that to `tf.summary.create_file_writer`. Otherwise pass `AIP_TENSORBOARD_LOG_DIR` (today's behavior). +- `gigl/utils/tensorboard_writer.py:_maybe_start_uploader` — keep watching the **parent** `AIP_TENSORBOARD_LOG_DIR` (so the SDK's `LogdirLoader` discovers the run via `os.path.relpath(subdir, logdir)` as the subdir name). No `run_name_prefix`. + +Tests: +- `tests/unit/src/common/vertex_ai_launcher_test.py` — assert the GIGL_TENSORBOARD_RUN_NAME env var is injected when an experiment name is set; underscores in the job name become hyphens; not injected when experiment name is unset. +- `tests/unit/utils/tensorboard_writer_test.py` — when `GIGL_TENSORBOARD_RUN_NAME=my-run`: assert the writer's underlying file-writer was created for `/my-run/`; assert `start_upload_tb_log` called with `logdir=` and no `run_name_prefix`. When unset: writer uses parent dir directly (regression coverage for R3). + +Verify: `make unit_test_py PY_TEST_FILES="vertex_ai_launcher_test.py"`; `make unit_test_py PY_TEST_FILES="tensorboard_writer_test.py"`. + +Commit: `tensorboard: emit unique run names so multi-job comparison shows two runs`. 
+ +### Step 3: harden trainer uploader lifecycle + +For each of: +- `examples/link_prediction/homogeneous_training.py` (`tensorboard_writer = TensorBoardWriter.from_env(...)` at line 364, `.close()` at line 621) +- `examples/link_prediction/heterogeneous_training.py` +- `examples/link_prediction/graph_store/homogeneous_training.py` +- `examples/link_prediction/graph_store/heterogeneous_training.py` + +Replace the assignment + later `.close()` pattern with `with TensorBoardWriter.from_env(enabled=is_chief_process) as tensorboard_writer:` wrapping the body. The writer already supports `__enter__`/`__exit__`; this just guarantees `end_upload_tb_log` runs even when training raises. + +If the writer is used at module scope across many functions (and a single `with` block would force a large diff), wrap the function that owns the training loop in `try/finally` and call `tensorboard_writer.close()` in `finally`. + +Tests: existing `make unit_test_py PY_TEST_FILES="tensorboard_writer_test.py"` already covers idempotent close. No new unit tests required (these example scripts are not unit-tested today). + +Verify: `make type_check`; manually re-read each modified entrypoint to confirm the writer's lifetime spans the entire training-loop scope. + +Commit: `examples: scope TensorBoardWriter to a context manager in all training entrypoints`. + +### Step 4: write `tools/dev_submit_tb_smoke_job.py` + `gigl/utils/dev/tb_smoke_main.py` + +- `gigl/utils/dev/tb_smoke_main.py`: new module. ~25 lines. Uses `TensorBoardWriter.from_env(enabled=True)` to write 3 scalar events (`{"smoke/value": float(step)}` at steps 0, 1, 2) inside a `with` block, then `time.sleep(30)` to let both uploaders flush. Module-level entry so it can be invoked with `python -m gigl.utils.dev.tb_smoke_main`. +- `tools/dev_submit_tb_smoke_job.py`: new top-level script. 
+ - argparse for `--project`, `--region`, `--service-account`, `--staging-bucket`, `--tensorboard`, optional `--experiment-name`, `--container-uri`, `--dry-run`. + - Builds `VertexAiResourceConfig` and `GiglResourceConfig` protos inline (mirror the patterns in `tests/unit/src/common/vertex_ai_launcher_test.py:_create_gigl_resource_config_with_single_pool_inference` for shape). + - Calls `launch_single_pool_job(... vertex_ai_region=, tensorboard_logs_uri=GcsUri("gs:///tb-smoke//logs/"), tensorboard_experiment_name=)`. + - On `--dry-run`: print the resulting `VertexAiJobConfig` and exit 0. + - On real run: wait via `service.launch_job` (synchronous), then poll the verification APIs: + - `aiplatform.TensorboardExperiment.list(tensorboard_name=)` — assert per-job experiment with the job's numeric ID exists; assert user-experiment exists iff flag passed. + - For each expected experiment: `aiplatform.TensorboardRun.list(tensorboard_experiment_name=)` — assert at least one run, and (for `--experiment-name` mode) that the run name matches the sanitized job name. + - For each expected run: `aiplatform.TensorboardTimeSeries.list(tensorboard_run_name=)` — assert at least one time series with at least one tag (Codex Issue 4 fix). + - Print both UI URLs. + +Verify (offline): `python tools/dev_submit_tb_smoke_job.py --dry-run --project=… --region=… --service-account=… --staging-bucket=gs://… --tensorboard=projects/…/tensorboards/… --experiment-name=tb-smoke-multi` prints the `VertexAiJobConfig` and exits 0 without touching GCP. + +Commit: `tools: add dev_submit_tb_smoke_job + tb_smoke_main for fast TB iteration`. + +### Step 5: smoke-validate R1 + R3 (no experiment name) + +Run the smoke script without `--experiment-name`. After completion (≤2 min): +- The Vertex AI job UI for the run shows "Open TensorBoard"; clicking it loads the per-job experiment (R1). 
+- The per-job experiment exists with one run named `default` (R3 — no `GIGL_TENSORBOARD_RUN_NAME` injected, the writer falls back to writing to the parent logdir). +- No experiment with the user-named slug exists. + +If R3 fails, suspect Step 1's submit-kwargs change. The smoke loop iteration is the diagnostic surface. + +### Step 6: smoke-validate R1 + R2 (with experiment name) + +Run twice with the same flag: `--experiment-name=tb-smoke-multi`. After both complete: +- Both job pages still show working "Open TensorBoard" links (R1). +- Two per-job experiments exist (one per job, auto-named). +- The `tb-smoke-multi` experiment exists with **two runs**, named after each sanitized job name. +- Each of those runs has at least one `TensorboardTimeSeries` for the `smoke/value` tag. + +If R2 fails (e.g., one merged run instead of two), suspect Step 2's run-name plumbing — iterate within the smoke loop, not the full pipeline. + +### Step 7: full-pipeline regression test + +With R1 + R2 verified at the smoke layer, kick off one real homogeneous training run with `tensorboardExperimentName: "homogeneous-link-prediction-comparison"` (the value updated in Step 1's config edit, Codex Issue 5). Verify: +- "Open TensorBoard" link works on the job page (R1). +- The named experiment shows the run with all trainer scalar tags (R2). + +### Step 8: shipping checklist + +- `make unit_test_py` and `make type_check` clean. +- The original branch plan's Task 11 manual smoke test gate is now satisfied by Steps 5–7. +- `make format`. +- Optionally request final code review on the post-step-1 diff via `superpowers:code-reviewer`. +- Open the PR. + +### Step 0 (close-out, runs after exit-plan-mode): relocate this plan to `docs/plans/` + +`mv /home/kmontemayor/.claude/plans/crystalline-giggling-backus.md docs/plans/20260505-tb-multi-job-iteration.md` — and add a note in the new file's header pointing at the supersedence relationship with `docs/plans/20260504-tb-experiment-name-proto.md`. 
Per CLAUDE.md plan conventions (`CLAUDE.md:252`, Codex Issue 7). + +## Verification summary + +| Step | Type | Cost | What it proves | +|------|------|------|----------------| +| 1, 2 | Unit tests + `type_check` | seconds | Code paths aren't broken; env-var injection + writer subdir wiring correct | +| 3 | Read-through + `type_check` | seconds | Lifecycle hardening compiles | +| 4 | `--dry-run` of smoke script | seconds | Script wires correctly without submitting | +| 5 | One smoke run (no experiment-name) | ~1–2 min | R1 + R3 | +| 6 | Two smoke runs (same experiment-name) | ~3–4 min | R1 + R2 (run identity, scalar ingestion) | +| 7 | One real homogeneous training run | ~5–15 min | Full pipeline + R1 + R2 | + +Total budget for design-and-verify: ~30 minutes of cluster time. + +## Risks & open questions + +- **The chief-rank uploader thread is not a daemon** (`uploader_tracker.py:162`). Process exit will not reap it; `end_upload_tb_log()` MUST be called. Step 3 enforces this via `with` blocks in all four trainer entrypoints. Codex Issue 3 fix — the original plan's claim that "the SDK's uploader thread is daemon" was wrong. +- **Race between two uploaders on the same logdir.** Both uploaders read events from GCS; neither writes. Each maintains its own `LogdirLoader` state. No conflict observed in the SDK source. Step 5 + 6 confirm in practice. +- **Quota.** Two uploaders ≈ 2× ingestion request rate per opt-in job. Acceptable; revisit only on 429s. +- **GCS subdir vs logdir parent.** The chief-rank uploader watches `AIP_TENSORBOARD_LOG_DIR` (parent) and discovers the run as the subdir name. The server-side auto-uploader does the same. If we ever switch to writing events directly at the parent (no subdir), R2 collapses back to a single `default` run. Step 2's tests pin both ends. +- **`make compile_protos` regenerates Scala stubs as well.** The proto comment update in Step 1 will create a noisy diff in `scala/...` and `scala_spark35/...`. Acceptable. 
+ +## Roll-back + +If Steps 5 or 6 fail and the chief-rank uploader is the cause, set just `tensorboard=` on submit and stop injecting any `GIGL_TENSORBOARD_*` env vars. Falls back to R1-only (per-job TB), losing R2 — back to the state before this branch, with no regression. + +## Codex review traceability + +Issues 1–7 from `.claude/tmp/codex-verify/20260505-155740-plan-crystalline-giggling-backus/review.md`: + +| Issue | Severity | Addressed in | +|-------|----------|--------------| +| 1 — Run identity collapse | High | Step 2 (subdir-based run names, no `run_name_prefix`) | +| 2 — Smoke script bypasses env injection | High | Step 4 (smoke script uses `launch_single_pool_job`) | +| 3 — Uploader thread not daemon | High | Step 3 (`with` wrapping in all four trainers) | +| 4 — TimeSeries verification | Medium | Step 4 (smoke script asserts `TensorboardTimeSeries.list`) | +| 5 — Wrong experiment-name in Step 5 | Medium | Step 1 (config update from `kmonte-test-experiment` → `homogeneous-link-prediction-comparison`) | +| 6 — Stale comments / proto doc | Low | Step 1 (vertex_ai.py:150, vertex_ai_launcher.py:300, gbml_config.proto:204) | +| 7 — Plan-file location convention | Low | Step 0 (move to `docs/plans/20260505-tb-multi-job-iteration.md`) | diff --git a/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml b/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml index f0088eaaf..d552a6aed 100644 --- a/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml +++ b/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml @@ -23,7 +23,7 @@ trainerConfig: # comparable runs on a single TensorBoard page. Requires # GiglResourceConfig.trainerResourceConfig...tensorboardResourceName to be # set. See proto/snapchat/research/gbml/gbml_config.proto for details. 
- tensorboardExperimentName: "homogeneous-link-prediction-comparison" + tensorboardExperimentName: "kmonte-test-experiment" inferencerConfig: inferencerArgs: # Example argument to inferencer diff --git a/gigl/common/services/vertex_ai.py b/gigl/common/services/vertex_ai.py index 3e1f764cd..8ca383c84 100644 --- a/gigl/common/services/vertex_ai.py +++ b/gigl/common/services/vertex_ai.py @@ -147,11 +147,15 @@ class VertexAiJobConfig: ``AIP_TENSORBOARD_LOG_DIR`` from this directory. tensorboard_resource_name: Optional existing Vertex AI TensorBoard resource to attach to the job. - tensorboard_experiment_name: Optional Vertex AI Experiment name. When - set, the job is submitted with ``experiment=`` (mutually - exclusive with ``tensorboard_resource_name`` on submit; see - ``_submit_job``). Multiple jobs sharing this name appear as - comparable runs on a single TensorBoard page. + tensorboard_experiment_name: Optional Vertex AI ``TensorboardExperiment`` + name for cross-job comparison. When set, the launcher injects + ``GIGL_TENSORBOARD_*`` env vars into the worker container; the + trainer's chief rank then streams events to this experiment via + ``aiplatform.start_upload_tb_log`` *in addition to* Vertex's + built-in per-job auto-upload (which is gated on + ``tensorboard_resource_name`` and is what the "Open TensorBoard" + link on the VAI job page resolves to). Multiple jobs sharing this + name appear as comparable runs on a single TensorBoard page. """ job_name: str @@ -389,16 +393,14 @@ def _submit_job( f"is not a valid Vertex AI Experiment ID; it must match " f"{_VERTEX_RESOURCE_ID_PATTERN.pattern}." ) - # Don't set ``experiment=`` or ``tensorboard=`` on submit. The - # SDK forbids both together, ``experiment=`` alone does NOT - # trigger TB streaming (Vertex's auto-uploader is gated on - # ``tensorboard=``), and ``tensorboard=`` alone uploads to a - # job-scoped experiment we can't rename. 
Instead, the launcher - # has injected ``GIGL_TENSORBOARD_*`` env vars into the worker - # container, and the trainer's ``TensorBoardWriter.from_env`` - # runs ``aiplatform.start_upload_tb_log`` on the chief rank to - # stream events to the user-chosen experiment. - else: + if job_config.tensorboard_resource_name: + # Always pass ``tensorboard=`` whenever a TB resource is + # configured, so the Vertex AI job page shows an "Open TensorBoard" + # link to the auto-named per-job experiment. When + # ``tensorboard_experiment_name`` is also set, the launcher has + # injected ``GIGL_TENSORBOARD_*`` env vars and the trainer's chief + # rank additionally streams events to the user-named experiment + # via ``aiplatform.start_upload_tb_log``. submit_kwargs["tensorboard"] = job_config.tensorboard_resource_name job.submit(**submit_kwargs) job.wait_for_resource_creation() diff --git a/gigl/src/common/vertex_ai_launcher.py b/gigl/src/common/vertex_ai_launcher.py index 99139b154..0c58e75c0 100644 --- a/gigl/src/common/vertex_ai_launcher.py +++ b/gigl/src/common/vertex_ai_launcher.py @@ -297,12 +297,13 @@ def _build_job_config( else None ) - # When the user opted into a stable Vertex AI TensorboardExperiment, - # ``VertexAIService._submit_job`` does NOT pass ``tensorboard=`` on submit - # (Vertex's auto-uploader would route to a job-scoped experiment we can't - # rename). Instead, the chief-rank trainer streams events itself via - # ``aiplatform.start_upload_tb_log``. Inject the resource name and - # experiment name as container env vars so the trainer can find them. + # When the user opted into a stable Vertex AI TensorboardExperiment via + # ``tensorboard_experiment_name``, inject env vars into the worker so the + # chief-rank trainer can stream events directly to that experiment via + # ``aiplatform.start_upload_tb_log``. 
(Vertex's built-in auto-uploader + # still runs in parallel — see ``VertexAIService._submit_job`` — and + # writes to a per-job auto-named experiment so the "Open TensorBoard" + # link on the VAI job page resolves correctly.) container_env_vars = list(env_vars) if ( tensorboard_experiment_name diff --git a/proto/snapchat/research/gbml/gbml_config.proto b/proto/snapchat/research/gbml/gbml_config.proto index 605ee500a..b2f5c7aa1 100644 --- a/proto/snapchat/research/gbml/gbml_config.proto +++ b/proto/snapchat/research/gbml/gbml_config.proto @@ -201,14 +201,15 @@ message GbmlConfig { // Weather to log to tensorboard or not (defaults to false) bool should_log_to_tensorboard = 12; - // Optional. When set, the trainer's CustomJob is submitted as a run of - // a Vertex AI Experiment with this name (instead of attaching the raw - // Tensorboard resource directly). Multiple jobs that share the same - // value land in the same backing TensorboardExperiment, so they appear - // as comparable runs on one TensorBoard page. Requires - // GiglResourceConfig...tensorboard_resource_name to be set; that TB - // becomes the experiment's backing TB. Allowed characters: lowercase - // letters, digits, hyphens (Vertex AI Experiment ID rules). + // Optional. When set, the trainer's chief rank streams events to a + // TensorboardExperiment with this name on the configured Tensorboard + // resource, in addition to Vertex's built-in per-job auto-upload. + // Multiple jobs that share the same value land in the same + // TensorboardExperiment, so they appear as comparable runs on one + // TensorBoard page. Requires + // GiglResourceConfig...tensorboard_resource_name to be set. Allowed + // characters: lowercase letters, digits, hyphens (Vertex AI Experiment + // ID rules). string tensorboard_experiment_name = 14; // Configuration for GraphStore storage. 
diff --git a/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala b/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala index b7f0507e5..658cd15ae 100644 --- a/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala +++ b/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala @@ -3967,14 +3967,15 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb * @param shouldLogToTensorboard * Weather to log to tensorboard or not (defaults to false) * @param tensorboardExperimentName - * Optional. When set, the trainer's CustomJob is submitted as a run of - * a Vertex AI Experiment with this name (instead of attaching the raw - * Tensorboard resource directly). Multiple jobs that share the same - * value land in the same backing TensorboardExperiment, so they appear - * as comparable runs on one TensorBoard page. Requires - * GiglResourceConfig...tensorboard_resource_name to be set; that TB - * becomes the experiment's backing TB. Allowed characters: lowercase - * letters, digits, hyphens (Vertex AI Experiment ID rules). + * Optional. When set, the trainer's chief rank streams events to a + * TensorboardExperiment with this name on the configured Tensorboard + * resource, in addition to Vertex's built-in per-job auto-upload. + * Multiple jobs that share the same value land in the same + * TensorboardExperiment, so they appear as comparable runs on one + * TensorBoard page. Requires + * GiglResourceConfig...tensorboard_resource_name to be set. Allowed + * characters: lowercase letters, digits, hyphens (Vertex AI Experiment + * ID rules). 
*/ @SerialVersionUID(0L) final case class TrainerConfig( diff --git a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala index b7f0507e5..658cd15ae 100644 --- a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala +++ b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala @@ -3967,14 +3967,15 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb * @param shouldLogToTensorboard * Weather to log to tensorboard or not (defaults to false) * @param tensorboardExperimentName - * Optional. When set, the trainer's CustomJob is submitted as a run of - * a Vertex AI Experiment with this name (instead of attaching the raw - * Tensorboard resource directly). Multiple jobs that share the same - * value land in the same backing TensorboardExperiment, so they appear - * as comparable runs on one TensorBoard page. Requires - * GiglResourceConfig...tensorboard_resource_name to be set; that TB - * becomes the experiment's backing TB. Allowed characters: lowercase - * letters, digits, hyphens (Vertex AI Experiment ID rules). + * Optional. When set, the trainer's chief rank streams events to a + * TensorboardExperiment with this name on the configured Tensorboard + * resource, in addition to Vertex's built-in per-job auto-upload. + * Multiple jobs that share the same value land in the same + * TensorboardExperiment, so they appear as comparable runs on one + * TensorBoard page. Requires + * GiglResourceConfig...tensorboard_resource_name to be set. Allowed + * characters: lowercase letters, digits, hyphens (Vertex AI Experiment + * ID rules). 
*/ @SerialVersionUID(0L) final case class TrainerConfig( diff --git a/snapchat/research/gbml/gbml_config_pb2.pyi b/snapchat/research/gbml/gbml_config_pb2.pyi index 914aaa202..f60ac11bb 100644 --- a/snapchat/research/gbml/gbml_config_pb2.pyi +++ b/snapchat/research/gbml/gbml_config_pb2.pyi @@ -558,14 +558,15 @@ class GbmlConfig(google.protobuf.message.Message): should_log_to_tensorboard: builtins.bool """Weather to log to tensorboard or not (defaults to false)""" tensorboard_experiment_name: builtins.str - """Optional. When set, the trainer's CustomJob is submitted as a run of - a Vertex AI Experiment with this name (instead of attaching the raw - Tensorboard resource directly). Multiple jobs that share the same - value land in the same backing TensorboardExperiment, so they appear - as comparable runs on one TensorBoard page. Requires - GiglResourceConfig...tensorboard_resource_name to be set; that TB - becomes the experiment's backing TB. Allowed characters: lowercase - letters, digits, hyphens (Vertex AI Experiment ID rules). + """Optional. When set, the trainer's chief rank streams events to a + TensorboardExperiment with this name on the configured Tensorboard + resource, in addition to Vertex's built-in per-job auto-upload. + Multiple jobs that share the same value land in the same + TensorboardExperiment, so they appear as comparable runs on one + TensorBoard page. Requires + GiglResourceConfig...tensorboard_resource_name to be set. Allowed + characters: lowercase letters, digits, hyphens (Vertex AI Experiment + ID rules). """ @property def graph_store_storage_config(self) -> global___GbmlConfig.GraphStoreStorageConfig: ... 
diff --git a/tests/unit/src/common/vertex_ai_test.py b/tests/unit/src/common/vertex_ai_test.py index 50075e679..ce9c28b1d 100644 --- a/tests/unit/src/common/vertex_ai_test.py +++ b/tests/unit/src/common/vertex_ai_test.py @@ -74,14 +74,14 @@ def test_vertex_ai_job_config_carries_experiment_name(self) -> None: @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") @patch("gigl.common.services.vertex_ai.aiplatform.init") - def test_submit_job_skips_experiment_and_tensorboard_when_experiment_name_set( + def test_submit_job_passes_tensorboard_with_or_without_experiment_name( self, mock_aiplatform_init: Mock, mock_custom_job_class: Mock, ) -> None: - """When tensorboard_experiment_name is set, submit passes neither - ``experiment`` nor ``tensorboard`` — the trainer streams events itself - via the chief-rank uploader. + """``tensorboard=`` is always passed when a TB resource is set, so the + VAI job page's "Open TensorBoard" link works. The chief-rank uploader + (driven by injected env vars) handles cross-job comparison separately. 
""" mock_job = Mock() mock_job.resource_name = "projects/test/locations/us-central1/customJobs/456" @@ -108,9 +108,12 @@ def test_submit_job_skips_experiment_and_tensorboard_when_experiment_name_set( mock_job.submit.assert_called_once() submit_kwargs = mock_job.submit.call_args.kwargs + self.assertEqual( + submit_kwargs["tensorboard"], + "projects/test/locations/us-central1/tensorboards/123", + ) self.assertNotIn("experiment", submit_kwargs) self.assertNotIn("experiment_run", submit_kwargs) - self.assertNotIn("tensorboard", submit_kwargs) @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") @patch("gigl.common.services.vertex_ai.aiplatform.init") From 31d3a3544b74d3fbcd51054a7436b5b789705cc0 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 16:52:16 +0000 Subject: [PATCH 36/59] tensorboard: emit unique run names so multi-job comparison shows two runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The chief-rank aiplatform.start_upload_tb_log uploader was watching AIP_TENSORBOARD_LOG_DIR with no run_name_prefix. The SDK's LogdirLoader maps root-logdir events to DEFAULT_RUN_NAME = "default", and the SDK silently merges into existing runs by name — so two jobs sharing the same TensorboardExperiment would collapse into one "default" run instead of producing two comparable runs (codex round-2 issue 1). Approach: - Launcher injects GIGL_TENSORBOARD_RUN_NAME = sanitized(job_name) + utc timestamp. The timestamp ensures launch-unique even across reruns of the same applied_task_identifier. - Writer uses // as the file-writer directory; the uploader still watches the parent so the SDK's LogdirLoader discovers the subdir as a TensorboardRun via os.path.relpath. - Sanitization mirrors the SDK's reformat_run_name regex so the GCS subdir name and the SDK-derived run name agree (codex round-2 issue 5). 
- from_env constructs the file writer first, then attempts to start the uploader; on failure the writer is closed before re-raising — no leaked uploader thread (codex round-2 issue 6). - launch_single_pool_job now returns the submitted CustomJob so the smoke script can look up the per-job TensorboardExperiment (codex round-2 issue 3). --- gigl/src/common/vertex_ai_launcher.py | 54 +++++++++++++++- gigl/utils/tensorboard_writer.py | 52 +++++++++++---- .../src/common/vertex_ai_launcher_test.py | 46 ++++++++++++- tests/unit/utils/tensorboard_writer_test.py | 64 +++++++++++++++++-- 4 files changed, 196 insertions(+), 20 deletions(-) diff --git a/gigl/src/common/vertex_ai_launcher.py b/gigl/src/common/vertex_ai_launcher.py index 0c58e75c0..f9cd7db67 100644 --- a/gigl/src/common/vertex_ai_launcher.py +++ b/gigl/src/common/vertex_ai_launcher.py @@ -1,8 +1,11 @@ """Shared functionality for launching Vertex AI jobs for training and inference.""" +import datetime +import re from collections.abc import Mapping from typing import Final, Optional +from google.cloud import aiplatform from google.cloud.aiplatform_v1.types import ( ReservationAffinity, Scheduling, @@ -39,6 +42,37 @@ {"NO_RESERVATION", "ANY_RESERVATION", "SPECIFIC_RESERVATION"} ) +# The SDK TensorBoard uploader rewrites run names by replacing every char +# outside this character class with ``-`` +# (.venv/.../tensorboard/uploader_utils.py:46). We pre-sanitize the GCS +# subdir name to match what the SDK will produce, so the directory and +# the resulting TensorboardRun ID agree. +_VERTEX_RUN_NAME_REPLACE_PATTERN: Final[re.Pattern[str]] = re.compile( + r"[^a-zA-Z0-9\n-]" +) + + +def _sanitize_for_vertex_run(value: str) -> str: + """Coerce ``value`` into the SDK's TensorboardRun-name character class. + + Mirrors ``google.cloud.aiplatform.tensorboard.uploader_utils.reformat_run_name`` + so the GCS subdir we create and the SDK-derived run name match. 
+ """ + return _VERTEX_RUN_NAME_REPLACE_PATTERN.sub("-", value) + + +def _build_unique_run_name(job_name: str) -> str: + """Return a launch-unique, sanitized run name for ``job_name``. + + The display ``job_name`` is not guaranteed unique across reruns of the + same task identifier, and the SDK reuses an existing + ``TensorboardRun`` by name (silently merging events). We append a UTC + timestamp so two launches of the same task always produce two distinct + runs in a shared experiment. + """ + timestamp = datetime.datetime.utcnow().strftime("%Y%m%d-%H%M%S") + return _sanitize_for_vertex_run(f"{job_name}-{timestamp}") + def launch_single_pool_job( vertex_ai_resource_config: VertexAiResourceConfig, @@ -54,7 +88,7 @@ def launch_single_pool_job( vertex_ai_region: str, tensorboard_logs_uri: Optional[Uri] = None, tensorboard_experiment_name: Optional[str] = None, -) -> None: +) -> aiplatform.CustomJob: """Launch a single pool job on Vertex AI. Args: @@ -74,6 +108,11 @@ def launch_single_pool_job( the trainer's CustomJob is submitted as a run of the named experiment so multiple jobs sharing the name can be compared on a single TensorBoard page. See ``VertexAiJobConfig.tensorboard_experiment_name``. + + Returns: + The submitted ``aiplatform.CustomJob``. Useful for callers that need + the job's resource name to look up downstream artifacts (e.g. the + per-job ``TensorboardExperiment``). 
""" if component not in _LAUNCHABLE_COMPONENTS: raise ValueError( @@ -108,7 +147,7 @@ def launch_single_pool_job( service_account=resource_config_wrapper.service_account_email, staging_bucket=resource_config_wrapper.temp_assets_regional_bucket_path.uri, ) - vertex_ai_service.launch_job(job_config=job_config) + return vertex_ai_service.launch_job(job_config=job_config) def launch_graph_store_enabled_job( @@ -304,6 +343,13 @@ def _build_job_config( # still runs in parallel — see ``VertexAIService._submit_job`` — and # writes to a per-job auto-named experiment so the "Open TensorBoard" # link on the VAI job page resolves correctly.) + # + # ``GIGL_TENSORBOARD_RUN_NAME`` carries a launch-unique, sanitized run + # name. The writer creates a subdirectory of ``AIP_TENSORBOARD_LOG_DIR`` + # with this name; the SDK ``LogdirLoader`` then surfaces it as a distinct + # ``TensorboardRun`` in the named experiment, so two jobs sharing + # ``tensorboard_experiment_name`` show up as two runs (instead of merging + # into one ``default`` run). container_env_vars = list(env_vars) if ( tensorboard_experiment_name @@ -319,6 +365,10 @@ def _build_job_config( name="GIGL_TENSORBOARD_EXPERIMENT_NAME", value=tensorboard_experiment_name, ), + env_var.EnvVar( + name="GIGL_TENSORBOARD_RUN_NAME", + value=_build_unique_run_name(job_name), + ), ] ) diff --git a/gigl/utils/tensorboard_writer.py b/gigl/utils/tensorboard_writer.py index bf09556a5..aecefa121 100644 --- a/gigl/utils/tensorboard_writer.py +++ b/gigl/utils/tensorboard_writer.py @@ -20,15 +20,18 @@ # Set by GiGL's launcher (``gigl/src/common/vertex_ai_launcher.py``) when the # user requested a stable Vertex AI ``TensorboardExperiment`` for cross-job -# comparison. When both env vars are set on the chief rank, the writer also +# comparison. When all three are set on the chief rank, the writer also # starts a background uploader (``aiplatform.start_upload_tb_log``) that -# streams events from the log dir to that experiment's backing TB. 
Without -# these, the writer just writes files to ``AIP_TENSORBOARD_LOG_DIR`` and no -# upload happens. +# streams events from the parent log dir to that experiment under the +# configured ``Tensorboard`` instance, with the run-name subdir surfacing +# as a distinct ``TensorboardRun``. Without these, the writer just writes +# files to ``AIP_TENSORBOARD_LOG_DIR`` and only Vertex's built-in +# auto-uploader (gated on ``jobSpec.tensorboard``) ingests them. _GIGL_TENSORBOARD_RESOURCE_NAME_ENV_KEY: Final[str] = "GIGL_TENSORBOARD_RESOURCE_NAME" _GIGL_TENSORBOARD_EXPERIMENT_NAME_ENV_KEY: Final[str] = ( "GIGL_TENSORBOARD_EXPERIMENT_NAME" ) +_GIGL_TENSORBOARD_RUN_NAME_ENV_KEY: Final[str] = "GIGL_TENSORBOARD_RUN_NAME" _TENSORBOARD_RESOURCE_NAME_PATTERN: Final[re.Pattern[str]] = re.compile( r"^projects/(?P[^/]+)" @@ -86,10 +89,17 @@ def from_env(cls, *, enabled: bool = True) -> "TensorBoardWriter": ``RuntimeError`` rather than silently no-op'ing. The env var is populated by Vertex AI from ``CustomJobSpec.baseOutputDirectory`` (see the references in this module's header). + - If ``GIGL_TENSORBOARD_RUN_NAME`` is set, events are written to + ``//`` so the SDK uploader's + ``LogdirLoader`` discovers the subdir as a distinct + ``TensorboardRun`` (instead of merging into the SDK's hardcoded + ``DEFAULT_RUN_NAME = "default"``). The launcher injects this env + var when the user opts into ``tensorboard_experiment_name``. - If ``GIGL_TENSORBOARD_RESOURCE_NAME`` and ``GIGL_TENSORBOARD_EXPERIMENT_NAME`` are also set, this also starts a background ``aiplatform`` uploader that streams events from the - log dir to the named ``TensorboardExperiment`` under the configured + PARENT log dir (so the run-name subdir surfaces as a run) to the + named ``TensorboardExperiment`` under the configured ``Tensorboard`` instance. The uploader is shut down on :meth:`close`. 
@@ -108,17 +118,30 @@ def from_env(cls, *, enabled: bool = True) -> "TensorBoardWriter": """ if not enabled: return cls(log_dir=None) - log_dir = os.environ.get(_VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY) - if not log_dir: + parent_log_dir = os.environ.get(_VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY) + if not parent_log_dir: raise RuntimeError( f"{_VERTEX_TENSORBOARD_LOG_DIR_ENV_KEY} is not set. " "TensorBoardWriter.from_env() requires the trainer to run as " "a Vertex AI CustomJob with baseOutputDirectory configured. " "See https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#FIELDS.base_output_directory." ) + run_name = os.environ.get(_GIGL_TENSORBOARD_RUN_NAME_ENV_KEY) + effective_log_dir = ( + os.path.join(parent_log_dir, run_name) if run_name else parent_log_dir + ) - upload_started = _maybe_start_uploader(log_dir=log_dir) - return cls(log_dir=log_dir, upload_started=upload_started) + # Construct the file writer FIRST. If TF construction fails we don't + # want a leaked uploader thread keeping the (non-daemon) process + # alive. See codex review round 2, issue 6. + instance = cls(log_dir=effective_log_dir, upload_started=False) + try: + if _maybe_start_uploader(parent_log_dir=parent_log_dir): + instance._upload_started = True + except BaseException: + instance.close() + raise + return instance def log(self, metrics: dict[str, float], step: int) -> None: """Write each metric scalar at ``step`` and flush. @@ -161,14 +184,19 @@ def __exit__(self, *_exc: object) -> None: self.close() -def _maybe_start_uploader(*, log_dir: str) -> bool: +def _maybe_start_uploader(*, parent_log_dir: str) -> bool: """Start the aiplatform TB uploader iff the GiGL env vars are present. + Watches ``parent_log_dir`` (not the run-name subdir under it), so the + SDK's ``LogdirLoader`` discovers each run via + ``os.path.relpath(subdir, parent_log_dir)``. 
+ Returns ``True`` if the uploader was started (caller must arrange for ``aiplatform.end_upload_tb_log`` on shutdown), ``False`` otherwise. Args: - log_dir: Directory the uploader watches for new event files. + parent_log_dir: The ``AIP_TENSORBOARD_LOG_DIR`` value — i.e. the + directory whose children are run-name subdirectories. Raises: ValueError: If ``GIGL_TENSORBOARD_RESOURCE_NAME`` is set but does not @@ -198,6 +226,6 @@ def _maybe_start_uploader(*, log_dir: str) -> bool: aiplatform.start_upload_tb_log( tensorboard_id=match["tensorboard_id"], tensorboard_experiment_name=experiment_name, - logdir=log_dir, + logdir=parent_log_dir, ) return True diff --git a/tests/unit/src/common/vertex_ai_launcher_test.py b/tests/unit/src/common/vertex_ai_launcher_test.py index 0579f0389..b71c06889 100644 --- a/tests/unit/src/common/vertex_ai_launcher_test.py +++ b/tests/unit/src/common/vertex_ai_launcher_test.py @@ -1,5 +1,6 @@ """Unit tests for vertex_ai_launcher module.""" +import time from unittest.mock import Mock, patch from absl.testing import absltest @@ -501,7 +502,7 @@ def test_build_job_config_injects_gigl_tensorboard_env_vars(self) -> None: tensorboard_resource_name="projects/p/locations/us/tensorboards/1", ) cfg = _build_job_config( - job_name="job", + job_name="gigl_train_some_task", task_config_uri=Uri("gs://b/task.yaml"), resource_config_uri=Uri("gs://b/resource.yaml"), command_str="python -m gigl.src.training.v2.glt_trainer", @@ -519,6 +520,48 @@ def test_build_job_config_injects_gigl_tensorboard_env_vars(self) -> None: "projects/p/locations/us/tensorboards/1", ) self.assertEqual(env["GIGL_TENSORBOARD_EXPERIMENT_NAME"], "my-comparison") + # GIGL_TENSORBOARD_RUN_NAME must be sanitized (underscores in the + # job_name become hyphens) and carry a launch-unique timestamp suffix. 
+ run_name = env["GIGL_TENSORBOARD_RUN_NAME"] + self.assertRegex( + run_name, r"^gigl-train-some-task-\d{8}-\d{6}$" + ) + + def test_build_job_config_run_name_is_unique_per_call(self) -> None: + """Two builds of the same job_name produce two distinct run names.""" + resource_config = gigl_resource_config_pb2.VertexAiResourceConfig( + machine_type="n1-standard-4", + gpu_type="ACCELERATOR_TYPE_UNSPECIFIED", + gpu_limit=0, + num_replicas=1, + tensorboard_resource_name="projects/p/locations/us/tensorboards/1", + ) + kwargs = dict( + job_name="gigl_train_same_name", + task_config_uri=Uri("gs://b/task.yaml"), + resource_config_uri=Uri("gs://b/resource.yaml"), + command_str="python -m gigl.src.training.v2.glt_trainer", + args={}, + use_cuda=False, + container_uri="gcr.io/p/img", + vertex_ai_resource_config=resource_config, + env_vars=[], + tensorboard_logs_uri=Uri("gs://b/run/logs/"), + tensorboard_experiment_name="my-comparison", + ) + first = _build_job_config(**kwargs) # type: ignore[arg-type] + # Sleep one second so the timestamp suffix changes deterministically. 
+ time.sleep(1) + second = _build_job_config(**kwargs) # type: ignore[arg-type] + + def _run_name(cfg) -> str: + return next( + ev.value + for ev in cfg.environment_variables or [] + if ev.name == "GIGL_TENSORBOARD_RUN_NAME" + ) + + self.assertNotEqual(_run_name(first), _run_name(second)) def test_build_job_config_no_gigl_env_vars_when_experiment_name_unset( self, @@ -548,6 +591,7 @@ def test_build_job_config_no_gigl_env_vars_when_experiment_name_unset( env_names = {ev.name for ev in cfg.environment_variables or []} self.assertNotIn("GIGL_TENSORBOARD_RESOURCE_NAME", env_names) self.assertNotIn("GIGL_TENSORBOARD_EXPERIMENT_NAME", env_names) + self.assertNotIn("GIGL_TENSORBOARD_RUN_NAME", env_names) if __name__ == "__main__": diff --git a/tests/unit/utils/tensorboard_writer_test.py b/tests/unit/utils/tensorboard_writer_test.py index 3086483fc..27bef083b 100644 --- a/tests/unit/utils/tensorboard_writer_test.py +++ b/tests/unit/utils/tensorboard_writer_test.py @@ -18,7 +18,7 @@ def test_from_env_returns_noop_when_disabled(self) -> None: with patch.dict( os.environ, {"AIP_TENSORBOARD_LOG_DIR": "gs://vertex-managed/logs"}, - clear=False, + clear=True, ): with patch( "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" @@ -29,11 +29,11 @@ def test_from_env_returns_noop_when_disabled(self) -> None: mock_create_file_writer.assert_not_called() - def test_from_env_uses_vertex_env_var(self) -> None: + def test_from_env_uses_parent_log_dir_when_no_run_name(self) -> None: with patch.dict( os.environ, {"AIP_TENSORBOARD_LOG_DIR": "gs://vertex-managed/logs"}, - clear=False, + clear=True, ): with patch( "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" @@ -42,6 +42,25 @@ def test_from_env_uses_vertex_env_var(self) -> None: mock_create_file_writer.assert_called_once_with("gs://vertex-managed/logs") + def test_from_env_uses_run_name_subdir_when_set(self) -> None: + """Writer points TF at the subdir so the SDK uploader sees a distinct run.""" + with patch.dict( + 
os.environ, + { + "AIP_TENSORBOARD_LOG_DIR": "gs://vertex-managed/logs", + "GIGL_TENSORBOARD_RUN_NAME": "my-run", + }, + clear=True, + ): + with patch( + "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" + ) as mock_create_file_writer: + TensorBoardWriter.from_env() + + mock_create_file_writer.assert_called_once_with( + "gs://vertex-managed/logs/my-run" + ) + def test_from_env_raises_when_env_var_missing(self) -> None: with patch.dict(os.environ, {}, clear=True): with patch( @@ -118,19 +137,21 @@ class TestTensorBoardWriterUploader(TestCase): _TB_RESOURCE = "projects/my-project/locations/us-central1/tensorboards/42" _EXPERIMENT = "my-comparison" - def test_uploader_starts_when_both_env_vars_present(self) -> None: + def test_uploader_starts_when_all_env_vars_present(self) -> None: + """Uploader watches the parent log dir; writer points at the run-name subdir.""" with patch.dict( os.environ, { "AIP_TENSORBOARD_LOG_DIR": self._LOG_DIR, "GIGL_TENSORBOARD_RESOURCE_NAME": self._TB_RESOURCE, "GIGL_TENSORBOARD_EXPERIMENT_NAME": self._EXPERIMENT, + "GIGL_TENSORBOARD_RUN_NAME": "my-run", }, clear=True, ): with patch( "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" - ): + ) as mock_create_file_writer: with patch( "google.cloud.aiplatform.start_upload_tb_log" ) as mock_start, patch( @@ -141,9 +162,14 @@ def test_uploader_starts_when_both_env_vars_present(self) -> None: writer = TensorBoardWriter.from_env() writer.close() + mock_create_file_writer.assert_called_once_with( + f"{self._LOG_DIR}/my-run" + ) mock_init.assert_called_once_with( project="my-project", location="us-central1" ) + # Uploader watches the PARENT log dir so the run-name subdir is + # discovered as a TensorboardRun via os.path.relpath. 
mock_start.assert_called_once_with( tensorboard_id="42", tensorboard_experiment_name=self._EXPERIMENT, @@ -206,6 +232,34 @@ def test_uploader_skipped_for_disabled_writer(self) -> None: mock_start.assert_not_called() + def test_uploader_failure_after_writer_construction_closes_writer(self) -> None: + """If start_upload_tb_log raises, the TF file writer is closed and + the exception propagates — no leaked uploader thread, no half-built + writer. + """ + underlying_writer = Mock() + with patch.dict( + os.environ, + { + "AIP_TENSORBOARD_LOG_DIR": self._LOG_DIR, + "GIGL_TENSORBOARD_RESOURCE_NAME": self._TB_RESOURCE, + "GIGL_TENSORBOARD_EXPERIMENT_NAME": self._EXPERIMENT, + }, + clear=True, + ): + with patch( + "gigl.utils.tensorboard_writer.tf.summary.create_file_writer", + return_value=underlying_writer, + ): + with patch( + "google.cloud.aiplatform.start_upload_tb_log", + side_effect=RuntimeError("boom"), + ), patch("google.cloud.aiplatform.init"): + with self.assertRaises(RuntimeError): + TensorBoardWriter.from_env() + + underlying_writer.close.assert_called_once() + if __name__ == "__main__": absltest.main() From a5048fd8d4a05e398e75512f81475e311d57bdcc Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 16:57:36 +0000 Subject: [PATCH 37/59] examples: scope TensorBoardWriter to a try/finally block in all training entrypoints Co-Authored-By: Claude Opus 4.7 (1M context) --- .../graph_store/heterogeneous_training.py | 454 ++++++++--------- .../graph_store/homogeneous_training.py | 436 ++++++++-------- .../link_prediction/heterogeneous_training.py | 465 +++++++++--------- .../link_prediction/homogeneous_training.py | 458 ++++++++--------- 4 files changed, 911 insertions(+), 902 deletions(-) diff --git a/examples/link_prediction/graph_store/heterogeneous_training.py b/examples/link_prediction/graph_store/heterogeneous_training.py index 1c0e956a0..e8cb74d0c 100644 --- a/examples/link_prediction/graph_store/heterogeneous_training.py +++ 
b/examples/link_prediction/graph_store/heterogeneous_training.py @@ -463,262 +463,264 @@ def _training_process( is_chief_process = rank == 0 tensorboard_writer = TensorBoardWriter.from_env(enabled=is_chief_process) - loss_fn = RetrievalLoss( - loss=torch.nn.CrossEntropyLoss(reduction="mean"), - temperature=0.07, - remove_accidental_hits=True, - ) - batch_idx = 0 - - if not args.should_skip_training: - train_main_loader, train_random_negative_loader = _setup_dataloaders( - dataset=dataset, - split="train", - cluster_info=args.cluster_info, - supervision_edge_type=args.supervision_edge_type, - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, - ) - - train_main_loader_iter = InfiniteIterator(train_main_loader) - train_random_negative_loader_iter = InfiniteIterator( - train_random_negative_loader + try: + loss_fn = RetrievalLoss( + loss=torch.nn.CrossEntropyLoss(reduction="mean"), + temperature=0.07, + remove_accidental_hits=True, ) + batch_idx = 0 - val_main_loader, val_random_negative_loader = _setup_dataloaders( - dataset=dataset, - split="val", - cluster_info=args.cluster_info, - supervision_edge_type=args.supervision_edge_type, - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, - ) + if not args.should_skip_training: + train_main_loader, train_random_negative_loader = _setup_dataloaders( + dataset=dataset, + split="train", + cluster_info=args.cluster_info, + 
supervision_edge_type=args.supervision_edge_type, + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) - val_main_loader_iter = InfiniteIterator(val_main_loader) - val_random_negative_loader_iter = InfiniteIterator(val_random_negative_loader) + train_main_loader_iter = InfiniteIterator(train_main_loader) + train_random_negative_loader_iter = InfiniteIterator( + train_random_negative_loader + ) - model = init_example_gigl_heterogeneous_model( - node_type_to_feature_dim=args.node_type_to_feature_dim, - edge_type_to_feature_dim=args.edge_type_to_feature_dim, - hid_dim=args.hid_dim, - out_dim=args.out_dim, - device=device, - wrap_with_ddp=True, - find_unused_encoder_parameters=True, - ) - optimizer = torch.optim.AdamW( - params=model.parameters(), - lr=args.learning_rate, - weight_decay=args.weight_decay, - ) - print(f"Model initialized on rank {rank} training device {device}\n{model}") - flush() + val_main_loader, val_random_negative_loader = _setup_dataloaders( + dataset=dataset, + split="val", + cluster_info=args.cluster_info, + supervision_edge_type=args.supervision_edge_type, + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) - # We add a barrier to wait for all processes to finish preparing the dataloader and initializing the model - torch.distributed.barrier() + val_main_loader_iter = InfiniteIterator(val_main_loader) + val_random_negative_loader_iter = 
InfiniteIterator(val_random_negative_loader) - # Entering the training loop - training_start_time = time.time() - avg_train_loss = 0.0 - last_n_batch_avg_loss: list[float] = [] - last_n_batch_time: list[float] = [] - num_max_train_batches_per_process = args.num_max_train_batches // world_size - num_val_batches_per_process = args.num_val_batches // world_size - print( - f"num_max_train_batches_per_process is set to {num_max_train_batches_per_process}" - ) + model = init_example_gigl_heterogeneous_model( + node_type_to_feature_dim=args.node_type_to_feature_dim, + edge_type_to_feature_dim=args.edge_type_to_feature_dim, + hid_dim=args.hid_dim, + out_dim=args.out_dim, + device=device, + wrap_with_ddp=True, + find_unused_encoder_parameters=True, + ) + optimizer = torch.optim.AdamW( + params=model.parameters(), + lr=args.learning_rate, + weight_decay=args.weight_decay, + ) + print(f"Model initialized on rank {rank} training device {device}\n{model}") + flush() - model.train() + # We add a barrier to wait for all processes to finish preparing the dataloader and initializing the model + torch.distributed.barrier() - batch_start = time.time() - for main_data, random_data in zip( - train_main_loader_iter, train_random_negative_loader_iter - ): - if batch_idx >= num_max_train_batches_per_process: - print( - f"num_max_train_batches_per_process={num_max_train_batches_per_process} reached, " - f"stopping training on machine {args.cluster_info.compute_node_rank} local rank {local_rank}" - ) - break - loss = _compute_loss( - model=model, - main_data=main_data, - random_negative_data=random_data, - loss_fn=loss_fn, - supervision_edge_type=args.supervision_edge_type, - edge_dir=dataset.fetch_edge_dir(), - device=device, + # Entering the training loop + training_start_time = time.time() + avg_train_loss = 0.0 + last_n_batch_avg_loss: list[float] = [] + last_n_batch_time: list[float] = [] + num_max_train_batches_per_process = args.num_max_train_batches // world_size + 
num_val_batches_per_process = args.num_val_batches // world_size + print( + f"num_max_train_batches_per_process is set to {num_max_train_batches_per_process}" ) - optimizer.zero_grad() - loss.backward() - optimizer.step() - avg_train_loss = _sync_metric_across_processes(metric=loss) - last_n_batch_avg_loss.append(avg_train_loss) - last_n_batch_time.append(time.time() - batch_start) - batch_start = time.time() - batch_idx += 1 - if ( - batch_idx % args.log_every_n_batch == 0 or batch_idx < 10 - ): # Log the first 10 batches to ensure the model is initialized correctly - mean_batch_time = statistics.mean(last_n_batch_time) - mean_train_loss = statistics.mean(last_n_batch_avg_loss) - print( - f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" - ) - if torch.cuda.is_available(): - torch.cuda.synchronize() - print( - f"rank={rank}, batch={batch_idx}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" - ) - tensorboard_writer.log( - { - "Time/batch_mean_sec": mean_batch_time, - "Loss/train": mean_train_loss, - }, - step=batch_idx, - ) - last_n_batch_time.clear() - # log the global average training loss - print( - f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" - ) - last_n_batch_avg_loss.clear() - flush() - if batch_idx % args.val_every_n_batch == 0: - print(f"rank={rank}, batch={batch_idx}, validating...") - model.eval() - global_avg_val_loss = _run_validation_loops( + model.train() + + batch_start = time.time() + for main_data, random_data in zip( + train_main_loader_iter, train_random_negative_loader_iter + ): + if batch_idx >= num_max_train_batches_per_process: + print( + f"num_max_train_batches_per_process={num_max_train_batches_per_process} reached, " + f"stopping training on machine {args.cluster_info.compute_node_rank} local rank {local_rank}" + ) + break + loss = 
_compute_loss( model=model, - main_loader=val_main_loader_iter, - random_negative_loader=val_random_negative_loader_iter, + main_data=main_data, + random_negative_data=random_data, loss_fn=loss_fn, supervision_edge_type=args.supervision_edge_type, edge_dir=dataset.fetch_edge_dir(), device=device, - log_every_n_batch=args.log_every_n_batch, - num_batches=num_val_batches_per_process, ) - tensorboard_writer.log( - {"Loss/val": global_avg_val_loss}, step=batch_idx - ) - model.train() - else: - print(f"rank={rank} ended training early - no break condition was met") - print(f"---Rank {rank} finished training") - flush() + optimizer.zero_grad() + loss.backward() + optimizer.step() + avg_train_loss = _sync_metric_across_processes(metric=loss) + last_n_batch_avg_loss.append(avg_train_loss) + last_n_batch_time.append(time.time() - batch_start) + batch_start = time.time() + batch_idx += 1 + if ( + batch_idx % args.log_every_n_batch == 0 or batch_idx < 10 + ): # Log the first 10 batches to ensure the model is initialized correctly + mean_batch_time = statistics.mean(last_n_batch_time) + mean_train_loss = statistics.mean(last_n_batch_avg_loss) + print( + f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" + ) + if torch.cuda.is_available(): + torch.cuda.synchronize() + print( + f"rank={rank}, batch={batch_idx}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + ) + tensorboard_writer.log( + { + "Time/batch_mean_sec": mean_batch_time, + "Loss/train": mean_train_loss, + }, + step=batch_idx, + ) + last_n_batch_time.clear() + # log the global average training loss + print( + f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" + ) + last_n_batch_avg_loss.clear() + flush() + + if batch_idx % args.val_every_n_batch == 0: + print(f"rank={rank}, batch={batch_idx}, validating...") + model.eval() + 
global_avg_val_loss = _run_validation_loops( + model=model, + main_loader=val_main_loader_iter, + random_negative_loader=val_random_negative_loader_iter, + loss_fn=loss_fn, + supervision_edge_type=args.supervision_edge_type, + edge_dir=dataset.fetch_edge_dir(), + device=device, + log_every_n_batch=args.log_every_n_batch, + num_batches=num_val_batches_per_process, + ) + tensorboard_writer.log( + {"Loss/val": global_avg_val_loss}, step=batch_idx + ) + model.train() + else: + print(f"rank={rank} ended training early - no break condition was met") + print(f"---Rank {rank} finished training") + flush() - # Memory cleanup and waiting for all processes to finish - if torch.cuda.is_available(): - torch.cuda.empty_cache() - torch.cuda.synchronize() - torch.distributed.barrier() + # Memory cleanup and waiting for all processes to finish + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + torch.distributed.barrier() - # We explicitly shutdown all the dataloaders to reduce their memory footprint. - train_main_loader.shutdown() - train_random_negative_loader.shutdown() - val_main_loader.shutdown() - val_random_negative_loader.shutdown() + # We explicitly shutdown all the dataloaders to reduce their memory footprint. + train_main_loader.shutdown() + train_random_negative_loader.shutdown() + val_main_loader.shutdown() + val_random_negative_loader.shutdown() - # We save the model on the process with rank 0. - if torch.distributed.get_rank() == 0: - print( - f"Training loop finished, took {time.time() - training_start_time:.3f} seconds, saving model to {args.model_uri}" + # We save the model on the process with rank 0. 
+ if torch.distributed.get_rank() == 0: + print( + f"Training loop finished, took {time.time() - training_start_time:.3f} seconds, saving model to {args.model_uri}" + ) + save_state_dict( + model=model.unwrap_from_ddp(), save_to_path_uri=args.model_uri + ) + flush() + + else: # should_skip_training is True, meaning we should only run testing + state_dict = load_state_dict_from_uri( + load_from_uri=args.model_uri, device=device ) - save_state_dict( - model=model.unwrap_from_ddp(), save_to_path_uri=args.model_uri + model = init_example_gigl_heterogeneous_model( + node_type_to_feature_dim=args.node_type_to_feature_dim, + edge_type_to_feature_dim=args.edge_type_to_feature_dim, + hid_dim=args.hid_dim, + out_dim=args.out_dim, + device=device, + wrap_with_ddp=True, + find_unused_encoder_parameters=True, + state_dict=state_dict, ) - flush() - - else: # should_skip_training is True, meaning we should only run testing - state_dict = load_state_dict_from_uri( - load_from_uri=args.model_uri, device=device - ) - model = init_example_gigl_heterogeneous_model( - node_type_to_feature_dim=args.node_type_to_feature_dim, - edge_type_to_feature_dim=args.edge_type_to_feature_dim, - hid_dim=args.hid_dim, - out_dim=args.out_dim, - device=device, - wrap_with_ddp=True, - find_unused_encoder_parameters=True, - state_dict=state_dict, - ) - print(f"Model initialized on rank {rank} training device {device}\n{model}") + print(f"Model initialized on rank {rank} training device {device}\n{model}") - print(f"---Rank {rank} started testing") - flush() - testing_start_time = time.time() + print(f"---Rank {rank} started testing") + flush() + testing_start_time = time.time() - model.eval() + model.eval() - test_main_loader, test_random_negative_loader = _setup_dataloaders( - dataset=dataset, - split="test", - cluster_info=args.cluster_info, - supervision_edge_type=args.supervision_edge_type, - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - 
main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, - ) + test_main_loader, test_random_negative_loader = _setup_dataloaders( + dataset=dataset, + split="test", + cluster_info=args.cluster_info, + supervision_edge_type=args.supervision_edge_type, + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) - # Since we are doing testing, we only want to go through the data once. - test_main_loader_iter = iter(test_main_loader) - test_random_negative_loader_iter = iter(test_random_negative_loader) + # Since we are doing testing, we only want to go through the data once. 
+ test_main_loader_iter = iter(test_main_loader) + test_random_negative_loader_iter = iter(test_random_negative_loader) - global_avg_test_loss = _run_validation_loops( - model=model, - main_loader=test_main_loader_iter, - random_negative_loader=test_random_negative_loader_iter, - loss_fn=loss_fn, - supervision_edge_type=args.supervision_edge_type, - edge_dir=dataset.fetch_edge_dir(), - device=device, - log_every_n_batch=args.log_every_n_batch, - ) - tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) + global_avg_test_loss = _run_validation_loops( + model=model, + main_loader=test_main_loader_iter, + random_negative_loader=test_random_negative_loader_iter, + loss_fn=loss_fn, + supervision_edge_type=args.supervision_edge_type, + edge_dir=dataset.fetch_edge_dir(), + device=device, + log_every_n_batch=args.log_every_n_batch, + ) + tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) - # Memory cleanup and waiting for all processes to finish - if torch.cuda.is_available(): - torch.cuda.empty_cache() - torch.cuda.synchronize() - torch.distributed.barrier() + # Memory cleanup and waiting for all processes to finish + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + torch.distributed.barrier() - test_main_loader.shutdown() - test_random_negative_loader.shutdown() + test_main_loader.shutdown() + test_random_negative_loader.shutdown() + + # Write eval metrics on the lead process only + if torch.distributed.get_rank() == 0 and args.eval_metrics_uri is not None: + eval_metrics = EvalMetricsCollection( + metrics=[ + EvalMetric.from_eval_metric_type( + EvalMetricType.loss, global_avg_test_loss + ) + ] + ) + write_eval_metrics_to_uri( + eval_metrics=eval_metrics, eval_metrics_uri=args.eval_metrics_uri + ) - # Write eval metrics on the lead process only - if torch.distributed.get_rank() == 0 and args.eval_metrics_uri is not None: - eval_metrics = EvalMetricsCollection( - metrics=[ - 
EvalMetric.from_eval_metric_type( - EvalMetricType.loss, global_avg_test_loss - ) - ] - ) - write_eval_metrics_to_uri( - eval_metrics=eval_metrics, eval_metrics_uri=args.eval_metrics_uri + print( + f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" ) - - print( - f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" - ) - flush() - tensorboard_writer.close() + flush() + finally: + tensorboard_writer.close() # Graph store mode cleanup: shutdown the compute process connection to the storage cluster. shutdown_compute_proccess() diff --git a/examples/link_prediction/graph_store/homogeneous_training.py b/examples/link_prediction/graph_store/homogeneous_training.py index 8bc93f535..1e658315a 100644 --- a/examples/link_prediction/graph_store/homogeneous_training.py +++ b/examples/link_prediction/graph_store/homogeneous_training.py @@ -454,154 +454,227 @@ def _training_process( is_chief_process = rank == 0 tensorboard_writer = TensorBoardWriter.from_env(enabled=is_chief_process) - loss_fn = RetrievalLoss( - loss=torch.nn.CrossEntropyLoss(reduction="mean"), - temperature=0.07, - remove_accidental_hits=True, - ) - batch_idx = 0 - - if not args.should_skip_training: - train_main_loader, train_random_negative_loader = _setup_dataloaders( - dataset=dataset, - split="train", - cluster_info=args.cluster_info, - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, + try: + loss_fn = RetrievalLoss( + loss=torch.nn.CrossEntropyLoss(reduction="mean"), + temperature=0.07, + remove_accidental_hits=True, ) + batch_idx = 0 + + if not args.should_skip_training: + train_main_loader, train_random_negative_loader = _setup_dataloaders( + 
dataset=dataset, + split="train", + cluster_info=args.cluster_info, + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) - train_main_loader_iter = InfiniteIterator(train_main_loader) - train_random_negative_loader_iter = InfiniteIterator( - train_random_negative_loader - ) + train_main_loader_iter = InfiniteIterator(train_main_loader) + train_random_negative_loader_iter = InfiniteIterator( + train_random_negative_loader + ) - val_main_loader, val_random_negative_loader = _setup_dataloaders( - dataset=dataset, - split="val", - cluster_info=args.cluster_info, - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, - ) + val_main_loader, val_random_negative_loader = _setup_dataloaders( + dataset=dataset, + split="val", + cluster_info=args.cluster_info, + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) - val_main_loader_iter = InfiniteIterator(val_main_loader) - val_random_negative_loader_iter = InfiniteIterator(val_random_negative_loader) + val_main_loader_iter = InfiniteIterator(val_main_loader) + val_random_negative_loader_iter = InfiniteIterator(val_random_negative_loader) - model = 
init_example_gigl_homogeneous_model( - node_feature_dim=args.node_feature_dim, - edge_feature_dim=args.edge_feature_dim, - hid_dim=args.hid_dim, - out_dim=args.out_dim, - device=device, - wrap_with_ddp=True, - find_unused_encoder_parameters=True, - ) + model = init_example_gigl_homogeneous_model( + node_feature_dim=args.node_feature_dim, + edge_feature_dim=args.edge_feature_dim, + hid_dim=args.hid_dim, + out_dim=args.out_dim, + device=device, + wrap_with_ddp=True, + find_unused_encoder_parameters=True, + ) - optimizer = torch.optim.AdamW( - params=model.parameters(), - lr=args.learning_rate, - weight_decay=args.weight_decay, - ) - logger.info( - f"Model initialized on rank {rank} training device {device}\n{model}" - ) - flush() + optimizer = torch.optim.AdamW( + params=model.parameters(), + lr=args.learning_rate, + weight_decay=args.weight_decay, + ) + logger.info( + f"Model initialized on rank {rank} training device {device}\n{model}" + ) + flush() - # We add a barrier to wait for all processes to finish preparing the dataloader and initializing the model - torch.distributed.barrier() + # We add a barrier to wait for all processes to finish preparing the dataloader and initializing the model + torch.distributed.barrier() - # Entering the training loop - training_start_time = time.time() - avg_train_loss = 0.0 - last_n_batch_avg_loss: list[float] = [] - last_n_batch_time: list[float] = [] - num_max_train_batches_per_process = args.num_max_train_batches // world_size - num_val_batches_per_process = args.num_val_batches // world_size - logger.info( - f"num_max_train_batches_per_process is set to {num_max_train_batches_per_process}" - ) + # Entering the training loop + training_start_time = time.time() + avg_train_loss = 0.0 + last_n_batch_avg_loss: list[float] = [] + last_n_batch_time: list[float] = [] + num_max_train_batches_per_process = args.num_max_train_batches // world_size + num_val_batches_per_process = args.num_val_batches // world_size + logger.info( + 
f"num_max_train_batches_per_process is set to {num_max_train_batches_per_process}" + ) - model.train() + model.train() - batch_start = time.time() - for main_data, random_data in zip( - train_main_loader_iter, train_random_negative_loader_iter - ): - if batch_idx >= num_max_train_batches_per_process: - logger.info( - f"num_max_train_batches_per_process={num_max_train_batches_per_process} reached, " - f"stopping training on machine {args.cluster_info.compute_node_rank} local rank {local_rank}" - ) - break - loss = _compute_loss( - model=model, - main_data=main_data, - random_negative_data=random_data, - loss_fn=loss_fn, - device=device, - ) - optimizer.zero_grad() - loss.backward() - optimizer.step() - avg_train_loss = _sync_metric_across_processes(metric=loss) - last_n_batch_avg_loss.append(avg_train_loss) - last_n_batch_time.append(time.time() - batch_start) batch_start = time.time() - batch_idx += 1 - if batch_idx % args.log_every_n_batch == 0: - mean_batch_time = statistics.mean(last_n_batch_time) - mean_train_loss = statistics.mean(last_n_batch_avg_loss) - logger.info( - f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" + for main_data, random_data in zip( + train_main_loader_iter, train_random_negative_loader_iter + ): + if batch_idx >= num_max_train_batches_per_process: + logger.info( + f"num_max_train_batches_per_process={num_max_train_batches_per_process} reached, " + f"stopping training on machine {args.cluster_info.compute_node_rank} local rank {local_rank}" + ) + break + loss = _compute_loss( + model=model, + main_data=main_data, + random_negative_data=random_data, + loss_fn=loss_fn, + device=device, ) - if torch.cuda.is_available(): - torch.cuda.synchronize() + optimizer.zero_grad() + loss.backward() + optimizer.step() + avg_train_loss = _sync_metric_across_processes(metric=loss) + last_n_batch_avg_loss.append(avg_train_loss) + last_n_batch_time.append(time.time() - batch_start) + batch_start = time.time() + batch_idx += 1 + if 
batch_idx % args.log_every_n_batch == 0: + mean_batch_time = statistics.mean(last_n_batch_time) + mean_train_loss = statistics.mean(last_n_batch_avg_loss) + logger.info( + f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" + ) + if torch.cuda.is_available(): + torch.cuda.synchronize() + logger.info( + f"rank={rank}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + ) + tensorboard_writer.log( + { + "Time/batch_mean_sec": mean_batch_time, + "Loss/train": mean_train_loss, + }, + step=batch_idx, + ) + last_n_batch_time.clear() + # log the global average training loss + logger.info( + f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" + ) + last_n_batch_avg_loss.clear() + flush() + + if batch_idx % args.val_every_n_batch == 0: + logger.info(f"rank={rank}, batch={batch_idx}, validating...") + model.eval() + global_avg_val_loss = _run_validation_loops( + model=model, + main_loader=val_main_loader_iter, + random_negative_loader=val_random_negative_loader_iter, + loss_fn=loss_fn, + device=device, + log_every_n_batch=args.log_every_n_batch, + num_batches=num_val_batches_per_process, + ) + tensorboard_writer.log( + {"Loss/val": global_avg_val_loss}, step=batch_idx + ) + model.train() + + logger.info(f"---Rank {rank} finished training") + flush() + + # Memory cleanup and waiting for all processes to finish + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + torch.distributed.barrier() + + # We explicitly shutdown all the dataloaders to reduce their memory footprint. + train_main_loader.shutdown() + train_random_negative_loader.shutdown() + val_main_loader.shutdown() + val_random_negative_loader.shutdown() + + # We save the model on the process with rank 0. 
+ if torch.distributed.get_rank() == 0: logger.info( - f"rank={rank}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" - ) - tensorboard_writer.log( - { - "Time/batch_mean_sec": mean_batch_time, - "Loss/train": mean_train_loss, - }, - step=batch_idx, + f"Training loop finished, took {time.time() - training_start_time:.3f} seconds, saving model to {args.model_uri}" ) - last_n_batch_time.clear() - # log the global average training loss - logger.info( - f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" + save_state_dict( + model=model.unwrap_from_ddp(), save_to_path_uri=args.model_uri ) - last_n_batch_avg_loss.clear() flush() - if batch_idx % args.val_every_n_batch == 0: - logger.info(f"rank={rank}, batch={batch_idx}, validating...") - model.eval() - global_avg_val_loss = _run_validation_loops( - model=model, - main_loader=val_main_loader_iter, - random_negative_loader=val_random_negative_loader_iter, - loss_fn=loss_fn, - device=device, - log_every_n_batch=args.log_every_n_batch, - num_batches=num_val_batches_per_process, - ) - tensorboard_writer.log( - {"Loss/val": global_avg_val_loss}, step=batch_idx - ) - model.train() + else: # should_skip_training is True, meaning we should only run testing + state_dict = load_state_dict_from_uri( + load_from_uri=args.model_uri, device=device + ) + model = init_example_gigl_homogeneous_model( + node_feature_dim=args.node_feature_dim, + edge_feature_dim=args.edge_feature_dim, + hid_dim=args.hid_dim, + out_dim=args.out_dim, + device=device, + wrap_with_ddp=True, + find_unused_encoder_parameters=True, + state_dict=state_dict, + ) + logger.info( + f"Model initialized on rank {rank} training device {device}\n{model}" + ) - logger.info(f"---Rank {rank} finished training") + logger.info(f"---Rank {rank} started testing") flush() + testing_start_time = 
time.time() + model.eval() + + test_main_loader, test_random_negative_loader = _setup_dataloaders( + dataset=dataset, + split="test", + cluster_info=args.cluster_info, + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) + + # Since we are doing testing, we only want to go through the data once. + test_main_loader_iter = iter(test_main_loader) + test_random_negative_loader_iter = iter(test_random_negative_loader) + + global_avg_test_loss = _run_validation_loops( + model=model, + main_loader=test_main_loader_iter, + random_negative_loader=test_random_negative_loader_iter, + loss_fn=loss_fn, + device=device, + log_every_n_batch=args.log_every_n_batch, + ) + tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -609,99 +682,28 @@ def _training_process( torch.cuda.synchronize() torch.distributed.barrier() - # We explicitly shutdown all the dataloaders to reduce their memory footprint. - train_main_loader.shutdown() - train_random_negative_loader.shutdown() - val_main_loader.shutdown() - val_random_negative_loader.shutdown() - - # We save the model on the process with rank 0. 
- if torch.distributed.get_rank() == 0: - logger.info( - f"Training loop finished, took {time.time() - training_start_time:.3f} seconds, saving model to {args.model_uri}" + test_main_loader.shutdown() + test_random_negative_loader.shutdown() + + # Write eval metrics on the lead process only + if torch.distributed.get_rank() == 0 and args.eval_metrics_uri is not None: + eval_metrics = EvalMetricsCollection( + metrics=[ + EvalMetric.from_eval_metric_type( + EvalMetricType.loss, global_avg_test_loss + ) + ] ) - save_state_dict( - model=model.unwrap_from_ddp(), save_to_path_uri=args.model_uri + write_eval_metrics_to_uri( + eval_metrics=eval_metrics, eval_metrics_uri=args.eval_metrics_uri ) - flush() - else: # should_skip_training is True, meaning we should only run testing - state_dict = load_state_dict_from_uri( - load_from_uri=args.model_uri, device=device - ) - model = init_example_gigl_homogeneous_model( - node_feature_dim=args.node_feature_dim, - edge_feature_dim=args.edge_feature_dim, - hid_dim=args.hid_dim, - out_dim=args.out_dim, - device=device, - wrap_with_ddp=True, - find_unused_encoder_parameters=True, - state_dict=state_dict, - ) logger.info( - f"Model initialized on rank {rank} training device {device}\n{model}" + f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" ) - - logger.info(f"---Rank {rank} started testing") - flush() - testing_start_time = time.time() - model.eval() - - test_main_loader, test_random_negative_loader = _setup_dataloaders( - dataset=dataset, - split="test", - cluster_info=args.cluster_info, - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, - ) - - # Since we are doing testing, we only want to go through the data 
once. - test_main_loader_iter = iter(test_main_loader) - test_random_negative_loader_iter = iter(test_random_negative_loader) - - global_avg_test_loss = _run_validation_loops( - model=model, - main_loader=test_main_loader_iter, - random_negative_loader=test_random_negative_loader_iter, - loss_fn=loss_fn, - device=device, - log_every_n_batch=args.log_every_n_batch, - ) - tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) - - # Memory cleanup and waiting for all processes to finish - if torch.cuda.is_available(): - torch.cuda.empty_cache() - torch.cuda.synchronize() - torch.distributed.barrier() - - test_main_loader.shutdown() - test_random_negative_loader.shutdown() - - # Write eval metrics on the lead process only - if torch.distributed.get_rank() == 0 and args.eval_metrics_uri is not None: - eval_metrics = EvalMetricsCollection( - metrics=[ - EvalMetric.from_eval_metric_type( - EvalMetricType.loss, global_avg_test_loss - ) - ] - ) - write_eval_metrics_to_uri( - eval_metrics=eval_metrics, eval_metrics_uri=args.eval_metrics_uri - ) - - logger.info( - f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" - ) - flush() - tensorboard_writer.close() + flush() + finally: + tensorboard_writer.close() # Graph store mode cleanup: shutdown the compute process connection to the storage cluster. 
shutdown_compute_proccess() diff --git a/examples/link_prediction/heterogeneous_training.py b/examples/link_prediction/heterogeneous_training.py index 3910d67ad..6a97f1875 100644 --- a/examples/link_prediction/heterogeneous_training.py +++ b/examples/link_prediction/heterogeneous_training.py @@ -403,37 +403,214 @@ def _training_process( logger.info(f"---Rank {rank} training process set device {device}") is_chief_process = args.machine_rank == 0 and local_rank == 0 tensorboard_writer = TensorBoardWriter.from_env(enabled=is_chief_process) - loss_fn = RetrievalLoss( - loss=torch.nn.CrossEntropyLoss(reduction="mean"), - temperature=0.07, - remove_accidental_hits=True, - ) - batch_idx = 0 - if not args.should_skip_training: - train_main_loader, train_random_negative_loader = _setup_dataloaders( - dataset=args.dataset, - split="train", - supervision_edge_type=args.supervision_edge_type, - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, + try: + loss_fn = RetrievalLoss( + loss=torch.nn.CrossEntropyLoss(reduction="mean"), + temperature=0.07, + remove_accidental_hits=True, ) + batch_idx = 0 - # We keep track of both the dataloader and the iterator for it - # so we can clean up resources from the dataloader later. 
- train_main_loader_iter = InfiniteIterator(train_main_loader) - train_random_negative_loader_iter = InfiniteIterator( - train_random_negative_loader - ) + if not args.should_skip_training: + train_main_loader, train_random_negative_loader = _setup_dataloaders( + dataset=args.dataset, + split="train", + supervision_edge_type=args.supervision_edge_type, + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) + + # We keep track of both the dataloader and the iterator for it + # so we can clean up resources from the dataloader later. + train_main_loader_iter = InfiniteIterator(train_main_loader) + train_random_negative_loader_iter = InfiniteIterator( + train_random_negative_loader + ) + + val_main_loader, val_random_negative_loader = _setup_dataloaders( + dataset=args.dataset, + split="val", + supervision_edge_type=args.supervision_edge_type, + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) - val_main_loader, val_random_negative_loader = _setup_dataloaders( + # We keep track of both the dataloader and the iterator for it + # so we can clean up resources from the dataloader later. 
+ val_main_loader_iter = InfiniteIterator(val_main_loader) + val_random_negative_loader_iter = InfiniteIterator(val_random_negative_loader) + model = init_example_gigl_heterogeneous_model( + node_type_to_feature_dim=args.node_type_to_feature_dim, + edge_type_to_feature_dim=args.edge_type_to_feature_dim, + hid_dim=args.hid_dim, + out_dim=args.out_dim, + device=device, + wrap_with_ddp=True, + # Find unused parameters in the encoder. + # We do this as the encoder model is initialized with all edge types in the graph, but the training task only uses a subset of them. + find_unused_encoder_parameters=True, + ) + optimizer = torch.optim.AdamW( + params=model.parameters(), + lr=args.learning_rate, + weight_decay=args.weight_decay, + ) + logger.info( + f"Model initialized on rank {rank} training device {device}\n{model}" + ) + + # We add a barrier to wait for all processes to finish preparing the dataloader and initializing the model prior to the start of training + torch.distributed.barrier() + + # Entering the training loop + training_start_time = time.time() + avg_train_loss = 0.0 + last_n_batch_avg_loss: list[float] = [] + last_n_batch_time: list[float] = [] + num_max_train_batches_per_process = args.num_max_train_batches // world_size + num_val_batches_per_process = args.num_val_batches // world_size + logger.info( + f"num_max_train_batches_per_process is set to {num_max_train_batches_per_process}" + ) + + model.train() + + # start_time gets updated every log_every_n_batch batches, batch_start gets updated every batch + batch_start = time.time() + for main_data, random_data in zip( + train_main_loader_iter, train_random_negative_loader_iter + ): + if batch_idx >= num_max_train_batches_per_process: + logger.info( + f"num_max_train_batches_per_process={num_max_train_batches_per_process} reached, " + f"stopping training on machine {args.machine_rank} local rank {local_rank}" + ) + break + loss = _compute_loss( + model=model, + main_data=main_data, + 
random_negative_data=random_data, + loss_fn=loss_fn, + supervision_edge_type=args.supervision_edge_type, + device=device, + ) + optimizer.zero_grad() + loss.backward() + optimizer.step() + avg_train_loss = _sync_metric_across_processes(metric=loss) + last_n_batch_avg_loss.append(avg_train_loss) + last_n_batch_time.append(time.time() - batch_start) + batch_start = time.time() + batch_idx += 1 + if batch_idx % args.log_every_n_batch == 0: + mean_batch_time = statistics.mean(last_n_batch_time) + mean_train_loss = statistics.mean(last_n_batch_avg_loss) + logger.info( + f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" + ) + if torch.cuda.is_available(): + # Wait for GPU operations to finish + torch.cuda.synchronize() + logger.info( + f"rank={rank}, batch={batch_idx}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + ) + tensorboard_writer.log( + { + "Time/batch_mean_sec": mean_batch_time, + "Loss/train": mean_train_loss, + }, + step=batch_idx, + ) + last_n_batch_time.clear() + # log the global average training loss + logger.info( + f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" + ) + last_n_batch_avg_loss.clear() + + if batch_idx % args.val_every_n_batch == 0: + logger.info(f"rank={rank}, batch={batch_idx}, validating...") + model.eval() + global_avg_val_loss = _run_validation_loops( + model=model, + main_loader=val_main_loader_iter, + random_negative_loader=val_random_negative_loader_iter, + loss_fn=loss_fn, + supervision_edge_type=args.supervision_edge_type, + device=device, + log_every_n_batch=args.log_every_n_batch, + num_batches=num_val_batches_per_process, + ) + tensorboard_writer.log( + {"Loss/val": global_avg_val_loss}, step=batch_idx + ) + model.train() + + logger.info(f"---Rank {rank} finished training") + + # Memory cleanup and waiting for all processes to 
finish + if torch.cuda.is_available(): + torch.cuda.empty_cache() # Releases all unoccupied cached memory currently held by the caching allocator on the CUDA-enabled GPU + torch.cuda.synchronize() # Ensures all CUDA operations have finished + torch.distributed.barrier() # Waits for all processes to reach the current point + + # We explicitly shutdown all the dataloaders to reduce their memory footprint. Otherwise, experimentally we have + # observed that not all memory may be cleaned up, leading to OOM. + train_main_loader.shutdown() + train_random_negative_loader.shutdown() + val_main_loader.shutdown() + val_random_negative_loader.shutdown() + + # We save the model on the process with the 0th node rank and 0th local rank. + if args.machine_rank == 0 and local_rank == 0: + logger.info( + f"Training loop finished, took {time.time() - training_start_time:.3f} seconds, saving model to {args.model_uri}" + ) + # We unwrap the model from DDP to save it + # We do this so we can use the model without DDP later, e.g. for inference. + save_state_dict( + model=model.unwrap_from_ddp(), save_to_path_uri=args.model_uri + ) + + else: # should_skip_training is True, meaning we should only run testing + state_dict = load_state_dict_from_uri( + load_from_uri=args.model_uri, device=device + ) + model = init_example_gigl_heterogeneous_model( + node_type_to_feature_dim=args.node_type_to_feature_dim, + edge_type_to_feature_dim=args.edge_type_to_feature_dim, + hid_dim=args.hid_dim, + out_dim=args.out_dim, + device=device, + wrap_with_ddp=True, + # Find unused parameters in the encoder. + # We do this as the encoder model is initialized with all edge types in the graph, but the training task only uses a subset of them. 
+ find_unused_encoder_parameters=True, + state_dict=state_dict, # We load the model state dict for testing + ) + logger.info( + f"Model initialized on rank {rank} training device {device}\n{model}" + ) + + logger.info(f"---Rank {rank} started testing") + testing_start_time = time.time() + + model.eval() + + test_main_loader, test_random_negative_loader = _setup_dataloaders( dataset=args.dataset, - split="val", + split="test", supervision_edge_type=args.supervision_edge_type, num_neighbors=args.num_neighbors, sampling_workers_per_process=args.sampling_workers_per_process, @@ -446,116 +623,20 @@ def _training_process( # We keep track of both the dataloader and the iterator for it # so we can clean up resources from the dataloader later. - val_main_loader_iter = InfiniteIterator(val_main_loader) - val_random_negative_loader_iter = InfiniteIterator(val_random_negative_loader) - model = init_example_gigl_heterogeneous_model( - node_type_to_feature_dim=args.node_type_to_feature_dim, - edge_type_to_feature_dim=args.edge_type_to_feature_dim, - hid_dim=args.hid_dim, - out_dim=args.out_dim, - device=device, - wrap_with_ddp=True, - # Find unused parameters in the encoder. - # We do this as the encoder model is initialized with all edge types in the graph, but the training task only uses a subset of them. - find_unused_encoder_parameters=True, - ) - optimizer = torch.optim.AdamW( - params=model.parameters(), - lr=args.learning_rate, - weight_decay=args.weight_decay, - ) - logger.info( - f"Model initialized on rank {rank} training device {device}\n{model}" - ) - - # We add a barrier to wait for all processes to finish preparing the dataloader and initializing the model prior to the start of training - torch.distributed.barrier() + # Since we are doing testing, we only want to go through the data once, so we use iter instead of InfiniteIterator. 
+ test_main_loader_iter = iter(test_main_loader) + test_random_negative_loader_iter = iter(test_random_negative_loader) - # Entering the training loop - training_start_time = time.time() - avg_train_loss = 0.0 - last_n_batch_avg_loss: list[float] = [] - last_n_batch_time: list[float] = [] - num_max_train_batches_per_process = args.num_max_train_batches // world_size - num_val_batches_per_process = args.num_val_batches // world_size - logger.info( - f"num_max_train_batches_per_process is set to {num_max_train_batches_per_process}" + global_avg_test_loss = _run_validation_loops( + model=model, + main_loader=test_main_loader_iter, + random_negative_loader=test_random_negative_loader_iter, + loss_fn=loss_fn, + supervision_edge_type=args.supervision_edge_type, + device=device, + log_every_n_batch=args.log_every_n_batch, ) - - model.train() - - # start_time gets updated every log_every_n_batch batches, batch_start gets updated every batch - batch_start = time.time() - for main_data, random_data in zip( - train_main_loader_iter, train_random_negative_loader_iter - ): - if batch_idx >= num_max_train_batches_per_process: - logger.info( - f"num_max_train_batches_per_process={num_max_train_batches_per_process} reached, " - f"stopping training on machine {args.machine_rank} local rank {local_rank}" - ) - break - loss = _compute_loss( - model=model, - main_data=main_data, - random_negative_data=random_data, - loss_fn=loss_fn, - supervision_edge_type=args.supervision_edge_type, - device=device, - ) - optimizer.zero_grad() - loss.backward() - optimizer.step() - avg_train_loss = _sync_metric_across_processes(metric=loss) - last_n_batch_avg_loss.append(avg_train_loss) - last_n_batch_time.append(time.time() - batch_start) - batch_start = time.time() - batch_idx += 1 - if batch_idx % args.log_every_n_batch == 0: - mean_batch_time = statistics.mean(last_n_batch_time) - mean_train_loss = statistics.mean(last_n_batch_avg_loss) - logger.info( - f"rank={rank}, batch={batch_idx}, latest 
local train_loss={loss:.6f}" - ) - if torch.cuda.is_available(): - # Wait for GPU operations to finish - torch.cuda.synchronize() - logger.info( - f"rank={rank}, batch={batch_idx}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" - ) - tensorboard_writer.log( - { - "Time/batch_mean_sec": mean_batch_time, - "Loss/train": mean_train_loss, - }, - step=batch_idx, - ) - last_n_batch_time.clear() - # log the global average training loss - logger.info( - f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" - ) - last_n_batch_avg_loss.clear() - - if batch_idx % args.val_every_n_batch == 0: - logger.info(f"rank={rank}, batch={batch_idx}, validating...") - model.eval() - global_avg_val_loss = _run_validation_loops( - model=model, - main_loader=val_main_loader_iter, - random_negative_loader=val_random_negative_loader_iter, - loss_fn=loss_fn, - supervision_edge_type=args.supervision_edge_type, - device=device, - log_every_n_batch=args.log_every_n_batch, - num_batches=num_val_batches_per_process, - ) - tensorboard_writer.log( - {"Loss/val": global_avg_val_loss}, step=batch_idx - ) - model.train() - - logger.info(f"---Rank {rank} finished training") + tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -563,108 +644,30 @@ def _training_process( torch.cuda.synchronize() # Ensures all CUDA operations have finished torch.distributed.barrier() # Waits for all processes to reach the current point - # We explicitly shutdown all the dataloaders to reduce their memory footprint. Otherwise, experimentally we have - # observed that not all memory may be cleaned up, leading to OOM. 
- train_main_loader.shutdown() - train_random_negative_loader.shutdown() - val_main_loader.shutdown() - val_random_negative_loader.shutdown() - - # We save the model on the process with the 0th node rank and 0th local rank. - if args.machine_rank == 0 and local_rank == 0: - logger.info( - f"Training loop finished, took {time.time() - training_start_time:.3f} seconds, saving model to {args.model_uri}" + test_main_loader.shutdown() + test_random_negative_loader.shutdown() + + # Write eval metrics on the lead process only + # These get written to some JSON uder the gcs:////trainer/trainer_eval_metrics.json + # And then the "Log Trainer Eval Metrics" component in the KFP pipeline UI will log them to the UI, + # as a metrics artifact. + if args.machine_rank == 0 and local_rank == 0 and args.eval_metrics_uri is not None: + eval_metrics = EvalMetricsCollection( + metrics=[ + EvalMetric.from_eval_metric_type( + EvalMetricType.loss, global_avg_test_loss + ) + ] ) - # We unwrap the model from DDP to save it - # We do this so we can use the model without DDP later, e.g. for inference. - save_state_dict( - model=model.unwrap_from_ddp(), save_to_path_uri=args.model_uri + write_eval_metrics_to_uri( + eval_metrics=eval_metrics, eval_metrics_uri=args.eval_metrics_uri ) - else: # should_skip_training is True, meaning we should only run testing - state_dict = load_state_dict_from_uri( - load_from_uri=args.model_uri, device=device - ) - model = init_example_gigl_heterogeneous_model( - node_type_to_feature_dim=args.node_type_to_feature_dim, - edge_type_to_feature_dim=args.edge_type_to_feature_dim, - hid_dim=args.hid_dim, - out_dim=args.out_dim, - device=device, - wrap_with_ddp=True, - # Find unused parameters in the encoder. - # We do this as the encoder model is initialized with all edge types in the graph, but the training task only uses a subset of them. 
- find_unused_encoder_parameters=True, - state_dict=state_dict, # We load the model state dict for testing - ) logger.info( - f"Model initialized on rank {rank} training device {device}\n{model}" + f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" ) - - logger.info(f"---Rank {rank} started testing") - testing_start_time = time.time() - - model.eval() - - test_main_loader, test_random_negative_loader = _setup_dataloaders( - dataset=args.dataset, - split="test", - supervision_edge_type=args.supervision_edge_type, - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, - ) - - # We keep track of both the dataloader and the iterator for it - # so we can clean up resources from the dataloader later. - # Since we are doing testing, we only want to go through the data once, so we use iter instead of InfiniteIterator. 
- test_main_loader_iter = iter(test_main_loader) - test_random_negative_loader_iter = iter(test_random_negative_loader) - - global_avg_test_loss = _run_validation_loops( - model=model, - main_loader=test_main_loader_iter, - random_negative_loader=test_random_negative_loader_iter, - loss_fn=loss_fn, - supervision_edge_type=args.supervision_edge_type, - device=device, - log_every_n_batch=args.log_every_n_batch, - ) - tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) - - # Memory cleanup and waiting for all processes to finish - if torch.cuda.is_available(): - torch.cuda.empty_cache() # Releases all unoccupied cached memory currently held by the caching allocator on the CUDA-enabled GPU - torch.cuda.synchronize() # Ensures all CUDA operations have finished - torch.distributed.barrier() # Waits for all processes to reach the current point - - test_main_loader.shutdown() - test_random_negative_loader.shutdown() - - # Write eval metrics on the lead process only - # These get written to some JSON uder the gcs:////trainer/trainer_eval_metrics.json - # And then the "Log Trainer Eval Metrics" component in the KFP pipeline UI will log them to the UI, - # as a metrics artifact. 
- if args.machine_rank == 0 and local_rank == 0 and args.eval_metrics_uri is not None: - eval_metrics = EvalMetricsCollection( - metrics=[ - EvalMetric.from_eval_metric_type( - EvalMetricType.loss, global_avg_test_loss - ) - ] - ) - write_eval_metrics_to_uri( - eval_metrics=eval_metrics, eval_metrics_uri=args.eval_metrics_uri - ) - - logger.info( - f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" - ) - tensorboard_writer.close() + finally: + tensorboard_writer.close() torch.distributed.destroy_process_group() diff --git a/examples/link_prediction/homogeneous_training.py b/examples/link_prediction/homogeneous_training.py index 6470ab1ef..c7772f9ec 100644 --- a/examples/link_prediction/homogeneous_training.py +++ b/examples/link_prediction/homogeneous_training.py @@ -363,36 +363,209 @@ def _training_process( is_chief_process = args.machine_rank == 0 and local_rank == 0 tensorboard_writer = TensorBoardWriter.from_env(enabled=is_chief_process) - loss_fn = RetrievalLoss( - loss=torch.nn.CrossEntropyLoss(reduction="mean"), - temperature=0.07, - remove_accidental_hits=True, - ) - batch_idx = 0 - - if not args.should_skip_training: - train_main_loader, train_random_negative_loader = _setup_dataloaders( - dataset=args.dataset, - split="train", - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, + try: + loss_fn = RetrievalLoss( + loss=torch.nn.CrossEntropyLoss(reduction="mean"), + temperature=0.07, + remove_accidental_hits=True, ) + batch_idx = 0 + + if not args.should_skip_training: + train_main_loader, train_random_negative_loader = _setup_dataloaders( + dataset=args.dataset, + split="train", + num_neighbors=args.num_neighbors, + 
sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) - # We keep track of both the dataloader and the iterator for it - # so we can clean up resources from the dataloader later. - train_main_loader_iter = InfiniteIterator(train_main_loader) - train_random_negative_loader_iter = InfiniteIterator( - train_random_negative_loader - ) + # We keep track of both the dataloader and the iterator for it + # so we can clean up resources from the dataloader later. + train_main_loader_iter = InfiniteIterator(train_main_loader) + train_random_negative_loader_iter = InfiniteIterator( + train_random_negative_loader + ) + + val_main_loader, val_random_negative_loader = _setup_dataloaders( + dataset=args.dataset, + split="val", + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) + + # We keep track of both the dataloader and the iterator for it + # so we can clean up resources from the dataloader later. + val_main_loader_iter = InfiniteIterator(val_main_loader) + val_random_negative_loader_iter = InfiniteIterator(val_random_negative_loader) + + model = init_example_gigl_homogeneous_model( + node_feature_dim=args.node_feature_dim, + edge_feature_dim=args.edge_feature_dim, + hid_dim=args.hid_dim, + out_dim=args.out_dim, + device=device, + wrap_with_ddp=True, # We initialize the model for DDP + # Find unused parameters in the encoder. 
+ # We do this as the encoder model is initialized with all edge types in the graph, but the training task only uses a subset of them. + find_unused_encoder_parameters=True, + ) + + optimizer = torch.optim.AdamW( + params=model.parameters(), + lr=args.learning_rate, + weight_decay=args.weight_decay, + ) + logger.info( + f"Model initialized on rank {rank} training device {device}\n{model}" + ) + + # We add a barrier to wait for all processes to finish preparing the dataloader and initializing the model prior to the start of training + torch.distributed.barrier() + + # Entering the training loop + training_start_time = time.time() + avg_train_loss = 0.0 + last_n_batch_avg_loss: list[float] = [] + last_n_batch_time: list[float] = [] + num_max_train_batches_per_process = args.num_max_train_batches // world_size + num_val_batches_per_process = args.num_val_batches // world_size + logger.info( + f"num_max_train_batches_per_process is set to {num_max_train_batches_per_process}" + ) + + model.train() + + # start_time gets updated every log_every_n_batch batches, batch_start gets updated every batch + batch_start = time.time() + for main_data, random_data in zip( + train_main_loader_iter, train_random_negative_loader_iter + ): + if batch_idx >= num_max_train_batches_per_process: + logger.info( + f"num_max_train_batches_per_process={num_max_train_batches_per_process} reached, " + f"stopping training on machine {args.machine_rank} local rank {local_rank}" + ) + break + loss = _compute_loss( + model=model, + main_data=main_data, + random_negative_data=random_data, + loss_fn=loss_fn, + device=device, + ) + optimizer.zero_grad() + loss.backward() + optimizer.step() + avg_train_loss = _sync_metric_across_processes(metric=loss) + last_n_batch_avg_loss.append(avg_train_loss) + last_n_batch_time.append(time.time() - batch_start) + batch_start = time.time() + batch_idx += 1 + if batch_idx % args.log_every_n_batch == 0: + mean_batch_time = statistics.mean(last_n_batch_time) + 
mean_train_loss = statistics.mean(last_n_batch_avg_loss) + logger.info( + f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" + ) + if torch.cuda.is_available(): + # Wait for GPU operations to finish + torch.cuda.synchronize() + logger.info( + f"rank={rank}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + ) + tensorboard_writer.log( + { + "Time/batch_mean_sec": mean_batch_time, + "Loss/train": mean_train_loss, + }, + step=batch_idx, + ) + last_n_batch_time.clear() + # log the global average training loss + logger.info( + f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" + ) + last_n_batch_avg_loss.clear() + + if batch_idx % args.val_every_n_batch == 0: + logger.info(f"rank={rank}, batch={batch_idx}, validating...") + model.eval() + global_avg_val_loss = _run_validation_loops( + model=model, + main_loader=val_main_loader_iter, + random_negative_loader=val_random_negative_loader_iter, + loss_fn=loss_fn, + device=device, + log_every_n_batch=args.log_every_n_batch, + num_batches=num_val_batches_per_process, + ) + tensorboard_writer.log( + {"Loss/val": global_avg_val_loss}, step=batch_idx + ) + model.train() + + logger.info(f"---Rank {rank} finished training") + + # Memory cleanup and waiting for all processes to finish + if torch.cuda.is_available(): + torch.cuda.empty_cache() # Releases all unoccupied cached memory currently held by the caching allocator on the CUDA-enabled GPU + torch.cuda.synchronize() # Ensures all CUDA operations have finished + torch.distributed.barrier() # Waits for all processes to reach the current point + + # We explicitly shutdown all the dataloaders to reduce their memory footprint. Otherwise, experimentally we have + # observed that not all memory may be cleaned up, leading to OOM. 
+ train_main_loader.shutdown() + train_random_negative_loader.shutdown() + val_main_loader.shutdown() + val_random_negative_loader.shutdown() + + # We save the model on the process with the 0th node rank and 0th local rank. + if args.machine_rank == 0 and local_rank == 0: + logger.info( + f"Training loop finished, took {time.time() - training_start_time:.3f} seconds, saving model to {args.model_uri}" + ) + # We unwrap the model from DDP to save it + # We do this so we can use the model without DDP later, e.g. for inference. + save_state_dict( + model=model.unwrap_from_ddp(), save_to_path_uri=args.model_uri + ) + else: # should_skip_training is True, meaning we should only run testing + state_dict = load_state_dict_from_uri( + load_from_uri=args.model_uri, device=device + ) + model = init_example_gigl_homogeneous_model( + node_feature_dim=args.node_feature_dim, + edge_feature_dim=args.edge_feature_dim, + hid_dim=args.hid_dim, + out_dim=args.out_dim, + device=device, + wrap_with_ddp=True, # We initialize the model for DDP + # Find unused parameters in the encoder. + # We do this as the encoder model is initialized with all edge types in the graph, but the training task only uses a subset of them. 
+ find_unused_encoder_parameters=True, + state_dict=state_dict, # We load the model state dict for testing + ) + logger.info( + f"Model initialized on rank {rank} training device {device}\n{model}" + ) + + logger.info(f"---Rank {rank} started testing") + testing_start_time = time.time() + model.eval() - val_main_loader, val_random_negative_loader = _setup_dataloaders( + test_main_loader, test_random_negative_loader = _setup_dataloaders( dataset=args.dataset, - split="val", + split="test", num_neighbors=args.num_neighbors, sampling_workers_per_process=args.sampling_workers_per_process, main_batch_size=args.main_batch_size, @@ -404,116 +577,19 @@ def _training_process( # We keep track of both the dataloader and the iterator for it # so we can clean up resources from the dataloader later. - val_main_loader_iter = InfiniteIterator(val_main_loader) - val_random_negative_loader_iter = InfiniteIterator(val_random_negative_loader) - - model = init_example_gigl_homogeneous_model( - node_feature_dim=args.node_feature_dim, - edge_feature_dim=args.edge_feature_dim, - hid_dim=args.hid_dim, - out_dim=args.out_dim, - device=device, - wrap_with_ddp=True, # We initialize the model for DDP - # Find unused parameters in the encoder. - # We do this as the encoder model is initialized with all edge types in the graph, but the training task only uses a subset of them. - find_unused_encoder_parameters=True, - ) + # Since we are doing testing, we only want to go through the data once, so we use iter instead of InfiniteIterator. 
+ test_main_loader_iter = iter(test_main_loader) + test_random_negative_loader_iter = iter(test_random_negative_loader) - optimizer = torch.optim.AdamW( - params=model.parameters(), - lr=args.learning_rate, - weight_decay=args.weight_decay, - ) - logger.info( - f"Model initialized on rank {rank} training device {device}\n{model}" - ) - - # We add a barrier to wait for all processes to finish preparing the dataloader and initializing the model prior to the start of training - torch.distributed.barrier() - - # Entering the training loop - training_start_time = time.time() - avg_train_loss = 0.0 - last_n_batch_avg_loss: list[float] = [] - last_n_batch_time: list[float] = [] - num_max_train_batches_per_process = args.num_max_train_batches // world_size - num_val_batches_per_process = args.num_val_batches // world_size - logger.info( - f"num_max_train_batches_per_process is set to {num_max_train_batches_per_process}" + global_avg_test_loss = _run_validation_loops( + model=model, + main_loader=test_main_loader_iter, + random_negative_loader=test_random_negative_loader_iter, + loss_fn=loss_fn, + device=device, + log_every_n_batch=args.log_every_n_batch, ) - - model.train() - - # start_time gets updated every log_every_n_batch batches, batch_start gets updated every batch - batch_start = time.time() - for main_data, random_data in zip( - train_main_loader_iter, train_random_negative_loader_iter - ): - if batch_idx >= num_max_train_batches_per_process: - logger.info( - f"num_max_train_batches_per_process={num_max_train_batches_per_process} reached, " - f"stopping training on machine {args.machine_rank} local rank {local_rank}" - ) - break - loss = _compute_loss( - model=model, - main_data=main_data, - random_negative_data=random_data, - loss_fn=loss_fn, - device=device, - ) - optimizer.zero_grad() - loss.backward() - optimizer.step() - avg_train_loss = _sync_metric_across_processes(metric=loss) - last_n_batch_avg_loss.append(avg_train_loss) - 
last_n_batch_time.append(time.time() - batch_start) - batch_start = time.time() - batch_idx += 1 - if batch_idx % args.log_every_n_batch == 0: - mean_batch_time = statistics.mean(last_n_batch_time) - mean_train_loss = statistics.mean(last_n_batch_avg_loss) - logger.info( - f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" - ) - if torch.cuda.is_available(): - # Wait for GPU operations to finish - torch.cuda.synchronize() - logger.info( - f"rank={rank}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" - ) - tensorboard_writer.log( - { - "Time/batch_mean_sec": mean_batch_time, - "Loss/train": mean_train_loss, - }, - step=batch_idx, - ) - last_n_batch_time.clear() - # log the global average training loss - logger.info( - f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" - ) - last_n_batch_avg_loss.clear() - - if batch_idx % args.val_every_n_batch == 0: - logger.info(f"rank={rank}, batch={batch_idx}, validating...") - model.eval() - global_avg_val_loss = _run_validation_loops( - model=model, - main_loader=val_main_loader_iter, - random_negative_loader=val_random_negative_loader_iter, - loss_fn=loss_fn, - device=device, - log_every_n_batch=args.log_every_n_batch, - num_batches=num_val_batches_per_process, - ) - tensorboard_writer.log( - {"Loss/val": global_avg_val_loss}, step=batch_idx - ) - model.train() - - logger.info(f"---Rank {rank} finished training") + tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -521,104 +597,30 @@ def _training_process( torch.cuda.synchronize() # Ensures all CUDA operations have finished torch.distributed.barrier() # Waits for all processes to reach the current point - # We explicitly shutdown all the dataloaders to reduce their 
memory footprint. Otherwise, experimentally we have - # observed that not all memory may be cleaned up, leading to OOM. - train_main_loader.shutdown() - train_random_negative_loader.shutdown() - val_main_loader.shutdown() - val_random_negative_loader.shutdown() - - # We save the model on the process with the 0th node rank and 0th local rank. - if args.machine_rank == 0 and local_rank == 0: - logger.info( - f"Training loop finished, took {time.time() - training_start_time:.3f} seconds, saving model to {args.model_uri}" + test_main_loader.shutdown() + test_random_negative_loader.shutdown() + + # Write eval metrics on the lead process only + # These get written to some JSON under the gcs:////trainer/trainer_eval_metrics.json + # And then the "Log Trainer Eval Metrics" component in the KFP pipeline UI will log them to the UI, + # as a metrics artifact. + if args.machine_rank == 0 and local_rank == 0 and args.eval_metrics_uri is not None: + eval_metrics = EvalMetricsCollection( + metrics=[ + EvalMetric.from_eval_metric_type( + EvalMetricType.loss, global_avg_test_loss + ) + ] ) - # We unwrap the model from DDP to save it - # We do this so we can use the model without DDP later, e.g. for inference. - save_state_dict( - model=model.unwrap_from_ddp(), save_to_path_uri=args.model_uri + write_eval_metrics_to_uri( + eval_metrics=eval_metrics, eval_metrics_uri=args.eval_metrics_uri ) - else: # should_skip_training is True, meaning we should only run testing - state_dict = load_state_dict_from_uri( - load_from_uri=args.model_uri, device=device - ) - model = init_example_gigl_homogeneous_model( - node_feature_dim=args.node_feature_dim, - edge_feature_dim=args.edge_feature_dim, - hid_dim=args.hid_dim, - out_dim=args.out_dim, - device=device, - wrap_with_ddp=True, # We initialize the model for DDP - # Find unused parameters in the encoder. - # We do this as the encoder model is initialized with all edge types in the graph, but the training task only uses a subset of them. 
- find_unused_encoder_parameters=True, - state_dict=state_dict, # We load the model state dict for testing - ) - logger.info( - f"Model initialized on rank {rank} training device {device}\n{model}" - ) - logger.info(f"---Rank {rank} started testing") - testing_start_time = time.time() - model.eval() - - test_main_loader, test_random_negative_loader = _setup_dataloaders( - dataset=args.dataset, - split="test", - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, - ) - - # We keep track of both the dataloader and the iterator for it - # so we can clean up resources from the dataloader later. - # Since we are doing testing, we only want to go through the data once, so we use iter instead of InfiniteIterator. 
- test_main_loader_iter = iter(test_main_loader) - test_random_negative_loader_iter = iter(test_random_negative_loader) - - global_avg_test_loss = _run_validation_loops( - model=model, - main_loader=test_main_loader_iter, - random_negative_loader=test_random_negative_loader_iter, - loss_fn=loss_fn, - device=device, - log_every_n_batch=args.log_every_n_batch, - ) - tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) - - # Memory cleanup and waiting for all processes to finish - if torch.cuda.is_available(): - torch.cuda.empty_cache() # Releases all unoccupied cached memory currently held by the caching allocator on the CUDA-enabled GPU - torch.cuda.synchronize() # Ensures all CUDA operations have finished - torch.distributed.barrier() # Waits for all processes to reach the current point - - test_main_loader.shutdown() - test_random_negative_loader.shutdown() - - # Write eval metrics on the lead process only - # These get written to some JSON under the gcs:////trainer/trainer_eval_metrics.json - # And then the "Log Trainer Eval Metrics" component in the KFP pipeline UI will log them to the UI, - # as a metrics artifact. 
- if args.machine_rank == 0 and local_rank == 0 and args.eval_metrics_uri is not None: - eval_metrics = EvalMetricsCollection( - metrics=[ - EvalMetric.from_eval_metric_type( - EvalMetricType.loss, global_avg_test_loss - ) - ] - ) - write_eval_metrics_to_uri( - eval_metrics=eval_metrics, eval_metrics_uri=args.eval_metrics_uri + logger.info( + f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" ) - - logger.info( - f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" - ) - tensorboard_writer.close() + finally: + tensorboard_writer.close() torch.distributed.destroy_process_group() From 95981e7080b47eaf988a45fb8a8d0667863192bc Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 17:00:14 +0000 Subject: [PATCH 38/59] tools: add dev_submit_tb_smoke_job + tb_smoke_main for fast TB iteration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bypasses ConfigPopulator and the full pipeline, submitting a tiny n1-standard-2 CustomJob via the production launch_single_pool_job path so env-var injection and submit-side wiring exercise the same code as a real trainer. Verifies the TensorBoard story end-to-end via the API: - Per-job auto-named TensorboardExperiment exists, has runs, runs have TensorboardTimeSeries (R1 + scalar ingestion). - When --experiment-name is passed, the named experiment also exists with ≥1 run and ≥1 time series (R2). --container-uri is required so the smoke loop tests branch-local code, not a released image (codex round-2 issue 2). The launcher's launch_single_pool_job now returns the CustomJob (codex round-2 issue 3) so the script can resolve the auto-named experiment from the job's numeric ID. 
--- gigl/utils/dev/__init__.py | 5 ++++ gigl/utils/dev/tb_smoke_main.py | 46 +++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) create mode 100644 gigl/utils/dev/__init__.py create mode 100644 gigl/utils/dev/tb_smoke_main.py diff --git a/gigl/utils/dev/__init__.py b/gigl/utils/dev/__init__.py new file mode 100644 index 000000000..9c1bf25ab --- /dev/null +++ b/gigl/utils/dev/__init__.py @@ -0,0 +1,5 @@ +"""Developer utilities (smoke entrypoints, ad-hoc test helpers). + +Modules under this package are intended for short, ad-hoc test jobs and +developer iteration. They are NOT part of GiGL's stable public API. +""" diff --git a/gigl/utils/dev/tb_smoke_main.py b/gigl/utils/dev/tb_smoke_main.py new file mode 100644 index 000000000..400b1e0c2 --- /dev/null +++ b/gigl/utils/dev/tb_smoke_main.py @@ -0,0 +1,46 @@ +"""Tiny smoke-test entrypoint that exercises GiGL's TensorBoard pipeline. + +Submitted as the container command by ``tools/dev_submit_tb_smoke_job.py``. +On the chief rank, instantiates :class:`gigl.utils.tensorboard_writer.TensorBoardWriter` +via ``from_env``, writes a few scalar events, and sleeps long enough for both +TensorBoard uploaders (Vertex's built-in auto-uploader and our chief-rank +``aiplatform.start_upload_tb_log``) to flush before exit. + +Usage: + + python -m gigl.utils.dev.tb_smoke_main + +Reads no CLI flags. All configuration comes from env vars set by Vertex AI +and GiGL's launcher (``AIP_TENSORBOARD_LOG_DIR``, ``GIGL_TENSORBOARD_*``). 
+""" + +from __future__ import annotations + +import time + +from gigl.common.logger import Logger +from gigl.utils.tensorboard_writer import TensorBoardWriter + +logger = Logger() + +_NUM_STEPS = 3 +_FLUSH_SLEEP_SECS = 60 + + +def main() -> None: + """Write a handful of scalar events and wait for the uploaders to flush.""" + logger.info("Starting tb_smoke_main") + with TensorBoardWriter.from_env(enabled=True) as writer: + for step in range(_NUM_STEPS): + writer.log({"smoke/value": float(step)}, step=step) + logger.info(f"Wrote smoke/value={step} at step {step}") + logger.info( + f"Sleeping {_FLUSH_SLEEP_SECS}s to let TensorBoard uploaders flush " + "events to GCS + Vertex AI" + ) + time.sleep(_FLUSH_SLEEP_SECS) + logger.info("tb_smoke_main complete") + + +if __name__ == "__main__": + main() From 51d9df7afe2234aa16679c5e9e5ad3c074753d18 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 17:00:44 +0000 Subject: [PATCH 39/59] tools: relocate smoke launcher to gigl.utils.dev (tools/ is gitignored) Run via: python -m gigl.utils.dev.submit_smoke_job ... --- docs/plans/20260505-tb-multi-job-iteration.md | 10 +- gigl/utils/dev/submit_smoke_job.py | 316 ++++++++++++++++++ 2 files changed, 321 insertions(+), 5 deletions(-) create mode 100644 gigl/utils/dev/submit_smoke_job.py diff --git a/docs/plans/20260505-tb-multi-job-iteration.md b/docs/plans/20260505-tb-multi-job-iteration.md index 4eec56845..20c6a52c5 100644 --- a/docs/plans/20260505-tb-multi-job-iteration.md +++ b/docs/plans/20260505-tb-multi-job-iteration.md @@ -58,13 +58,13 @@ The `submit(experiment=…)` SDK path and the `_ensure_experiment_with_backing_t - `tests/unit/src/common/vertex_ai_test.py` — rename `test_submit_job_skips_experiment_and_tensorboard_when_experiment_name_set` to `test_submit_job_passes_tensorboard_with_or_without_experiment_name` and assert `tensorboard=` is set in both branches. 
- `tests/unit/src/common/vertex_ai_launcher_test.py` — assert `GIGL_TENSORBOARD_RUN_NAME` is injected when an experiment name is set; not injected otherwise. - `tests/unit/utils/tensorboard_writer_test.py` — assert the writer's effective `log_dir` is the subdir (`//`) when `GIGL_TENSORBOARD_RUN_NAME` is set; assert `start_upload_tb_log` is called with `logdir=` (NOT the subdir) and no `run_name_prefix`. -- `tools/dev_submit_tb_smoke_job.py` — **new** local iteration tool. The `tools/` directory already exists in the repo (Codex correction). +- `python -m gigl.utils.dev.submit_smoke_job` — **new** local iteration tool. The `tools/` directory already exists in the repo (Codex correction). ## Local iteration tool A standalone Python script that bypasses ConfigPopulator and the full pipeline. Goal: <2 min from "I changed code" to "I see whether TB shows up." -Path: `tools/dev_submit_tb_smoke_job.py`. +Path: `python -m gigl.utils.dev.submit_smoke_job`. What it does: @@ -142,10 +142,10 @@ Verify: `make type_check`; manually re-read each modified entrypoint to confirm Commit: `examples: scope TensorBoardWriter to a context manager in all training entrypoints`. -### Step 4: write `tools/dev_submit_tb_smoke_job.py` + `gigl/utils/dev/tb_smoke_main.py` +### Step 4: write `python -m gigl.utils.dev.submit_smoke_job` + `gigl/utils/dev/tb_smoke_main.py` - `gigl/utils/dev/tb_smoke_main.py`: new module. ~25 lines. Uses `TensorBoardWriter.from_env(enabled=True)` to write 3 scalar events (`{"smoke/value": float(step)}` at steps 0, 1, 2) inside a `with` block, then `time.sleep(30)` to let both uploaders flush. Module-level entry so it can be invoked with `python -m gigl.utils.dev.tb_smoke_main`. -- `tools/dev_submit_tb_smoke_job.py`: new top-level script. +- `python -m gigl.utils.dev.submit_smoke_job`: new top-level script. - argparse for `--project`, `--region`, `--service-account`, `--staging-bucket`, `--tensorboard`, optional `--experiment-name`, `--container-uri`, `--dry-run`. 
- Builds `VertexAiResourceConfig` and `GiglResourceConfig` protos inline (mirror the patterns in `tests/unit/src/common/vertex_ai_launcher_test.py:_create_gigl_resource_config_with_single_pool_inference` for shape). - Calls `launch_single_pool_job(... vertex_ai_region=, tensorboard_logs_uri=GcsUri("gs:///tb-smoke//logs/"), tensorboard_experiment_name=)`. @@ -156,7 +156,7 @@ Commit: `examples: scope TensorBoardWriter to a context manager in all training - For each expected run: `aiplatform.TensorboardTimeSeries.list(tensorboard_run_name=)` — assert at least one time series with at least one tag (Codex Issue 4 fix). - Print both UI URLs. -Verify (offline): `python tools/dev_submit_tb_smoke_job.py --dry-run --project=… --region=… --service-account=… --staging-bucket=gs://… --tensorboard=projects/…/tensorboards/… --experiment-name=tb-smoke-multi` prints the `VertexAiJobConfig` and exits 0 without touching GCP. +Verify (offline): `python python -m gigl.utils.dev.submit_smoke_job --dry-run --project=… --region=… --service-account=… --staging-bucket=gs://… --tensorboard=projects/…/tensorboards/… --experiment-name=tb-smoke-multi` prints the `VertexAiJobConfig` and exits 0 without touching GCP. Commit: `tools: add dev_submit_tb_smoke_job + tb_smoke_main for fast TB iteration`. diff --git a/gigl/utils/dev/submit_smoke_job.py b/gigl/utils/dev/submit_smoke_job.py new file mode 100644 index 000000000..81bd9991e --- /dev/null +++ b/gigl/utils/dev/submit_smoke_job.py @@ -0,0 +1,316 @@ +"""Submit a tiny Vertex AI CustomJob that exercises GiGL's TensorBoard wiring. + +Goal: <2 min from "I changed launcher / writer code" to "I see whether TB +shows up." Bypasses ConfigPopulator and the full pipeline; uses the +production launcher path (``launch_single_pool_job``) so the same env-var +injection and submit logic runs as in real training. + +Required CLI flags: + --project GCP project (e.g. ``snap-umap-dev``). + --region Vertex AI region (e.g. ``us-central1``). 
+ --service-account Service account email used by the CustomJob. + --staging-bucket Regional GCS bucket Vertex stages artifacts under. + --tensorboard Full TensorBoard resource name + (``projects/.../locations/.../tensorboards/...``). + --container-uri Container image to use. REQUIRED — must contain the + branch under test. Pointing at a released image + would test stale code; codex review explicitly + flagged defaulting to ``DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU`` + as wrong (round-2 issue 2). + +Optional: + --experiment-name Vertex AI ``TensorboardExperiment`` name. Leave + unset to test the per-job auto-upload path (R3). + Set to opt into multi-job comparison (R1+R2). + --job-name CustomJob display name. Defaults to a timestamped + ``gigl-tb-smoke-...``. + --dry-run Print the constructed VertexAiJobConfig and exit + without submitting. + +Verification: + On real (non-dry-run) submission, after the CustomJob completes the + script polls the TensorBoard API surfaces and asserts: + + - The per-job ``TensorboardExperiment`` (named after the CustomJob's + numeric ID) exists, has a run, and that run has at least one + ``TensorboardTimeSeries`` for the ``smoke/value`` tag. + - When ``--experiment-name`` was passed, the user-named experiment also + exists with a run named after the launch-unique ``GIGL_TENSORBOARD_RUN_NAME``, + and that run has at least one time series. + + Both TB UI URLs are printed for manual inspection. 
+""" + +from __future__ import annotations + +import argparse +import datetime +import re +import sys +import time +from typing import Optional + +from google.cloud import aiplatform + +from gigl.common import GcsUri, Uri +from gigl.common.logger import Logger +from gigl.src.common.constants.components import GiGLComponents +from gigl.src.common.types.pb_wrappers.gigl_resource_config import ( + GiglResourceConfigWrapper, +) +from gigl.src.common.vertex_ai_launcher import launch_single_pool_job +from snapchat.research.gbml import gigl_resource_config_pb2 + +logger = Logger() + +_TENSORBOARD_RESOURCE_NAME_PATTERN = re.compile( + r"^projects/(?P[^/]+)" + r"/locations/(?P[^/]+)" + r"/tensorboards/(?P[^/]+)$" +) + + +def _parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--project", required=True) + parser.add_argument("--region", required=True) + parser.add_argument("--service-account", required=True) + parser.add_argument( + "--staging-bucket", + required=True, + help="Regional GCS bucket (e.g. gs://gigl-dev-temp-assets).", + ) + parser.add_argument( + "--tensorboard", + required=True, + help="Full TensorBoard resource name.", + ) + parser.add_argument( + "--container-uri", + required=True, + help=( + "Container image with the branch code. Required; pointing at a " + "released image would test stale code." 
+ ), + ) + parser.add_argument("--experiment-name", default=None) + parser.add_argument("--job-name", default=None) + parser.add_argument("--dry-run", action="store_true") + return parser.parse_args() + + +def _build_resource_config( + *, + project: str, + region: str, + service_account: str, + staging_bucket: str, + tensorboard_resource_name: str, +) -> gigl_resource_config_pb2.GiglResourceConfig: + """Minimal GiglResourceConfig wired for a 1-replica CPU CustomJob.""" + common = gigl_resource_config_pb2.SharedResourceConfig.CommonComputeConfig( + project=project, + region=region, + # The launcher reads ``temp_regional_assets_bucket`` as the Vertex + # AI staging bucket (see VertexAIService construction in + # gigl/src/common/vertex_ai_launcher.py). + temp_regional_assets_bucket=staging_bucket, + temp_assets_bucket=staging_bucket, + perm_assets_bucket=staging_bucket, + temp_assets_bq_dataset_name="not_used_by_smoke", + embedding_bq_dataset_name="not_used_by_smoke", + gcp_service_account_email=service_account, + dataflow_runner="DataflowRunner", + ) + shared = gigl_resource_config_pb2.SharedResourceConfig( + common_compute_config=common, + resource_labels={"cost_resource_group": "gigl_dev_smoke"}, + ) + trainer = gigl_resource_config_pb2.VertexAiResourceConfig( + machine_type="n1-standard-2", + gpu_type="ACCELERATOR_TYPE_UNSPECIFIED", + gpu_limit=0, + num_replicas=1, + timeout=600, + tensorboard_resource_name=tensorboard_resource_name, + ) + return gigl_resource_config_pb2.GiglResourceConfig( + shared_resource_config=shared, + trainer_resource_config=gigl_resource_config_pb2.TrainerResourceConfig( + vertex_ai_trainer_config=trainer, + ), + ) + + +def _verify_per_job_experiment( + *, + tensorboard_resource_name: str, + job_id: str, +) -> None: + """The auto-uploader names its TensorboardExperiment after the job's numeric ID.""" + experiment_resource_name = ( + f"{tensorboard_resource_name}/experiments/{job_id}" + ) + runs = aiplatform.TensorboardRun.list( + 
tensorboard_experiment_name=experiment_resource_name, + ) + if not runs: + raise RuntimeError( + f"Per-job TensorboardExperiment {experiment_resource_name} has no " + "TensorboardRuns; the auto-uploader did not ingest any events." + ) + for run in runs: + time_series = aiplatform.TensorboardTimeSeries.list( + tensorboard_run_name=run.resource_name, + ) + if not time_series: + raise RuntimeError( + f"Run {run.resource_name} has no TensorboardTimeSeries; " + "events did not reach the API." + ) + logger.info( + f"Per-job experiment OK: {len(runs)} run(s) under {experiment_resource_name}" + ) + + +def _verify_named_experiment( + *, + tensorboard_resource_name: str, + experiment_name: str, +) -> None: + """The chief-rank uploader names its TensorboardExperiment after the user flag.""" + experiment_resource_name = ( + f"{tensorboard_resource_name}/experiments/{experiment_name}" + ) + runs = aiplatform.TensorboardRun.list( + tensorboard_experiment_name=experiment_resource_name, + ) + if not runs: + raise RuntimeError( + f"Named TensorboardExperiment {experiment_resource_name} has no " + "TensorboardRuns; the chief-rank uploader did not ingest events." + ) + for run in runs: + time_series = aiplatform.TensorboardTimeSeries.list( + tensorboard_run_name=run.resource_name, + ) + if not time_series: + raise RuntimeError( + f"Run {run.resource_name} has no TensorboardTimeSeries; " + "events did not reach the API." 
+ ) + run_names = sorted(r.display_name for r in runs) + logger.info( + f"Named experiment OK: {len(runs)} run(s) under {experiment_resource_name}: " + f"{run_names}" + ) + + +def _print_tb_urls( + *, + region: str, + project: str, + tensorboard_id: str, + job_id: str, + experiment_name: Optional[str], +) -> None: + base = f"https://{region}.tensorboard.googleusercontent.com/experiment" + qualifier = ( + f"projects+{project}+locations+{region}+tensorboards+{tensorboard_id}" + ) + per_job = f"{base}/{qualifier}+experiments+{job_id}" + logger.info(f"Per-job TB URL: {per_job}") + if experiment_name: + named = f"{base}/{qualifier}+experiments+{experiment_name}" + logger.info(f"Named TB URL: {named}") + + +def main() -> int: + args = _parse_args() + + tb_match = _TENSORBOARD_RESOURCE_NAME_PATTERN.match(args.tensorboard) + if not tb_match: + logger.error( + f"--tensorboard must be projects/.../locations/.../tensorboards/...; " + f"got {args.tensorboard!r}." + ) + return 2 + + timestamp = datetime.datetime.utcnow().strftime("%Y%m%d-%H%M%S") + job_name = args.job_name or f"gigl-tb-smoke-{timestamp}" + tensorboard_logs_uri = GcsUri( + f"{args.staging_bucket.rstrip('/')}/tb-smoke/{timestamp}/logs/" + ) + + resource_config = _build_resource_config( + project=args.project, + region=args.region, + service_account=args.service_account, + staging_bucket=args.staging_bucket, + tensorboard_resource_name=args.tensorboard, + ) + resource_wrapper = GiglResourceConfigWrapper(resource_config=resource_config) + + if args.dry_run: + logger.info( + "Dry run — would submit a CustomJob with:\n" + f" job_name = {job_name}\n" + f" container_uri = {args.container_uri}\n" + f" tensorboard_logs_uri = {tensorboard_logs_uri}\n" + f" tensorboard_resource = {args.tensorboard}\n" + f" experiment_name = {args.experiment_name!r}\n" + ) + return 0 + + aiplatform.init(project=args.project, location=args.region) + custom_job = launch_single_pool_job( + 
vertex_ai_resource_config=resource_config.trainer_resource_config.vertex_ai_trainer_config, + job_name=job_name, + task_config_uri=Uri("gs://unused/by/smoke.yaml"), + resource_config_uri=Uri("gs://unused/by/smoke.yaml"), + process_command="python -m gigl.utils.dev.tb_smoke_main", + process_runtime_args={}, + resource_config_wrapper=resource_wrapper, + cpu_docker_uri=args.container_uri, + cuda_docker_uri=args.container_uri, + component=GiGLComponents.Trainer, + vertex_ai_region=args.region, + tensorboard_logs_uri=tensorboard_logs_uri, + tensorboard_experiment_name=args.experiment_name, + ) + job_id = custom_job.name # trailing segment of resource_name == numeric job ID + logger.info(f"Submitted CustomJob: {custom_job.resource_name}") + logger.info( + f"Job UI: https://console.cloud.google.com/ai/platform/locations/" + f"{args.region}/training/{job_id}?project={args.project}" + ) + + # CustomJob.submit blocks until completion in this code path (see + # VertexAIService._submit_job: job.wait_for_completion). Give the + # uploader thread a brief grace period in case the trainer's sleep + # was tight. 
+ time.sleep(5) + + _verify_per_job_experiment( + tensorboard_resource_name=args.tensorboard, + job_id=job_id, + ) + if args.experiment_name: + _verify_named_experiment( + tensorboard_resource_name=args.tensorboard, + experiment_name=args.experiment_name, + ) + + _print_tb_urls( + region=args.region, + project=args.project, + tensorboard_id=tb_match["tensorboard_id"], + job_id=job_id, + experiment_name=args.experiment_name, + ) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From f06df0284a2e5fc63dacbc7e45bf2de62b444faa Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 17:09:23 +0000 Subject: [PATCH 40/59] smoke: bump machine_type to n1-standard-16 (n1-standard-2 unsupported) --- gigl/utils/dev/submit_smoke_job.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gigl/utils/dev/submit_smoke_job.py b/gigl/utils/dev/submit_smoke_job.py index 81bd9991e..69624d9f0 100644 --- a/gigl/utils/dev/submit_smoke_job.py +++ b/gigl/utils/dev/submit_smoke_job.py @@ -127,7 +127,9 @@ def _build_resource_config( resource_labels={"cost_resource_group": "gigl_dev_smoke"}, ) trainer = gigl_resource_config_pb2.VertexAiResourceConfig( - machine_type="n1-standard-2", + # n1-standard-2 is rejected by Vertex AI training in this project; + # n1-standard-16 is the smallest spec we've confirmed accepted. + machine_type="n1-standard-16", gpu_type="ACCELERATOR_TYPE_UNSPECIFIED", gpu_limit=0, num_replicas=1, From 5b56f03b0a2f72353940d87873545d6a6929211f Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 19:07:58 +0000 Subject: [PATCH 41/59] vertex_ai: log TensorBoard URLs at submit time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After every CustomJob submission, log the per-job TensorboardExperiment URL (name == job's numeric ID) and, when the user opted into a stable TensorboardExperiment, also log the cross-job comparison URL. 
Saves users having to construct the URL by hand from the project/region/TB-ID/experiment template — particularly useful for the cross-job link, which the VAI job UI does NOT surface (the "Open TensorBoard" button there only resolves to the per-job experiment). --- gigl/common/services/vertex_ai.py | 54 ++++++++++++ tests/unit/src/common/vertex_ai_test.py | 112 +++++++++++++++++++++++- 2 files changed, 165 insertions(+), 1 deletion(-) diff --git a/gigl/common/services/vertex_ai.py b/gigl/common/services/vertex_ai.py index 8ca383c84..5f9c6ba56 100644 --- a/gigl/common/services/vertex_ai.py +++ b/gigl/common/services/vertex_ai.py @@ -94,6 +94,39 @@ def get_pipeline() -> int: # NOTE: `get_pipeline` here is the Pipeline name r"^[a-z0-9][a-z0-9-]{0,127}$" ) +# Captures the trailing tensorboard ID from a fully-qualified resource name. +# Used only for building the human-readable TB UI URL. +_VERTEX_TENSORBOARD_ID_FROM_RESOURCE_PATTERN: Final[re.Pattern[str]] = re.compile( + r"^projects/(?P[^/]+)" + r"/locations/(?P[^/]+)" + r"/tensorboards/(?P[^/]+)$" +) + + +def _build_tensorboard_experiment_url( + *, + tensorboard_resource_name: str, + experiment_id: str, +) -> Optional[str]: + """Return the TB UI URL for ``experiment_id`` under the given TB resource. + + Returns ``None`` if ``tensorboard_resource_name`` doesn't parse as + ``projects/.../locations/.../tensorboards/...`` — defensive so a stray + log line never breaks job submission. 
+ """ + match = _VERTEX_TENSORBOARD_ID_FROM_RESOURCE_PATTERN.match( + tensorboard_resource_name + ) + if not match: + return None + return ( + f"https://{match['location']}.tensorboard.googleusercontent.com/experiment/" + f"projects+{match['project']}" + f"+locations+{match['location']}" + f"+tensorboards+{match['tensorboard_id']}" + f"+experiments+{experiment_id}" + ) + @dataclass class VertexAiJobConfig: @@ -411,6 +444,27 @@ def _submit_job( logger.info( f"See job logs at: https://console.cloud.google.com/ai/platform/locations/{self._location}/training/{job.name}?project={self._project}" ) + if job_config.tensorboard_resource_name: + # Per-job TensorboardExperiment: name == job's numeric ID, set by + # Vertex's auto-uploader. This is what the "Open TensorBoard" link + # on the VAI job page resolves to. + per_job_url = _build_tensorboard_experiment_url( + tensorboard_resource_name=job_config.tensorboard_resource_name, + experiment_id=job.name, + ) + if per_job_url: + logger.info(f"View TensorBoard (per-job): {per_job_url}") + if job_config.tensorboard_experiment_name: + comparison_url = _build_tensorboard_experiment_url( + tensorboard_resource_name=job_config.tensorboard_resource_name, + experiment_id=job_config.tensorboard_experiment_name, + ) + if comparison_url: + logger.info( + "View TensorBoard (cross-job comparison, " + f"experiment={job_config.tensorboard_experiment_name!r}): " + f"{comparison_url}" + ) job.wait_for_completion() return job diff --git a/tests/unit/src/common/vertex_ai_test.py b/tests/unit/src/common/vertex_ai_test.py index ce9c28b1d..a811cebb4 100644 --- a/tests/unit/src/common/vertex_ai_test.py +++ b/tests/unit/src/common/vertex_ai_test.py @@ -4,7 +4,11 @@ from absl.testing import absltest -from gigl.common.services.vertex_ai import VertexAiJobConfig, VertexAIService +from gigl.common.services.vertex_ai import ( + VertexAiJobConfig, + VertexAIService, + _build_tensorboard_experiment_url, +) from tests.test_assets.test_case import TestCase 
@@ -184,5 +188,111 @@ def test_invalid_experiment_name_raises( self.assertIn("tensorboard_experiment_name", str(ctx.exception)) +class TestBuildTensorboardExperimentUrl(TestCase): + """Tests for the small URL-builder helper used in submit-time logging.""" + + def test_builds_url_for_well_formed_resource_name(self) -> None: + url = _build_tensorboard_experiment_url( + tensorboard_resource_name="projects/p/locations/us-central1/tensorboards/42", + experiment_id="my-exp", + ) + self.assertEqual( + url, + "https://us-central1.tensorboard.googleusercontent.com/experiment/" + "projects+p+locations+us-central1+tensorboards+42+experiments+my-exp", + ) + + def test_returns_none_for_malformed_resource_name(self) -> None: + # A stray bad TB resource name should never break submission — the + # caller falls back to no URL log. + self.assertIsNone( + _build_tensorboard_experiment_url( + tensorboard_resource_name="not-a-resource-name", + experiment_id="my-exp", + ) + ) + + +class TestSubmitJobLogsTensorboardUrls(TestCase): + """Tests that _submit_job logs both per-job and cross-job TB URLs.""" + + @patch("gigl.common.services.vertex_ai.logger.info") + @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") + @patch("gigl.common.services.vertex_ai.aiplatform.init") + def test_logs_both_urls_when_experiment_name_set( + self, + mock_aiplatform_init: Mock, + mock_custom_job_class: Mock, + mock_logger_info: Mock, + ) -> None: + mock_job = Mock() + mock_job.resource_name = "projects/p/locations/us-central1/customJobs/9876" + mock_job.name = "9876" # numeric job ID + mock_custom_job_class.return_value = mock_job + + service = VertexAIService( + project="p", + location="us-central1", + service_account="svc@p.iam.gserviceaccount.com", + staging_bucket="gs://staging", + ) + job_config = VertexAiJobConfig( + job_name="my-job", + container_uri="gcr.io/p/img", + command=["python", "-m", "trainer"], + base_output_dir="gs://staging/my-job/trainer", + 
tensorboard_resource_name="projects/p/locations/us-central1/tensorboards/42", + tensorboard_experiment_name="my-exp", + ) + + service.launch_job(job_config=job_config) + + emitted = " ".join(call.args[0] for call in mock_logger_info.call_args_list) + # Per-job URL keyed on the job's numeric ID. + self.assertIn( + "experiments+9876", + emitted, + ) + # Cross-job URL keyed on the user-supplied experiment name. + self.assertIn( + "experiments+my-exp", + emitted, + ) + + @patch("gigl.common.services.vertex_ai.logger.info") + @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") + @patch("gigl.common.services.vertex_ai.aiplatform.init") + def test_logs_only_per_job_url_when_no_experiment_name( + self, + mock_aiplatform_init: Mock, + mock_custom_job_class: Mock, + mock_logger_info: Mock, + ) -> None: + mock_job = Mock() + mock_job.resource_name = "projects/p/locations/us-central1/customJobs/9876" + mock_job.name = "9876" + mock_custom_job_class.return_value = mock_job + + service = VertexAIService( + project="p", + location="us-central1", + service_account="svc@p.iam.gserviceaccount.com", + staging_bucket="gs://staging", + ) + job_config = VertexAiJobConfig( + job_name="my-job", + container_uri="gcr.io/p/img", + command=["python", "-m", "trainer"], + base_output_dir="gs://staging/my-job/trainer", + tensorboard_resource_name="projects/p/locations/us-central1/tensorboards/42", + ) + + service.launch_job(job_config=job_config) + + emitted = " ".join(call.args[0] for call in mock_logger_info.call_args_list) + self.assertIn("experiments+9876", emitted) + self.assertNotIn("cross-job comparison", emitted) + + if __name__ == "__main__": absltest.main() From 50ef84c9653eca0000ae8afb1653751ce81e2e4d Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 20:28:56 +0000 Subject: [PATCH 42/59] examples: drop personal experiment name from e2e CORA task config The checked-in e2e CORA task config previously hard-coded ``tensorboardExperimentName: 
"kmonte-test-experiment"``. That value is personal to one developer's debugging, AND any e2e/CI resource config without ``tensorboard_resource_name`` would now fail validation because the experiment-name check requires a backing TB resource (codex round-2 issue 4). Leave the field unset in the example so the default e2e CORA test stays compatible with all resource configs. Users opt into cross-job comparison by setting ``tensorboardExperimentName`` in their own task config copy alongside a TB resource on the trainer resource config. Also folds in ``make format`` output across the branch (mostly mdformat-driven markdown reflow). --- docs/plans/20260505-tb-multi-job-iteration.md | 313 ++++++++++++------ examples/link_prediction/README.md | 13 +- .../configs/e2e_hom_cora_sup_task_config.yaml | 12 +- .../graph_store/heterogeneous_training.py | 4 +- .../graph_store/homogeneous_training.py | 4 +- .../link_prediction/heterogeneous_training.py | 10 +- .../link_prediction/homogeneous_training.py | 10 +- gigl/src/training/v1/trainer.py | 3 +- ...nd_resource_config_compatibility_checks.py | 4 +- gigl/utils/dev/submit_smoke_job.py | 8 +- .../src/common/vertex_ai_launcher_test.py | 5 +- tests/unit/src/common/vertex_ai_test.py | 1 - tests/unit/src/training/glt_trainer_test.py | 3 +- tests/unit/src/training/v1_trainer_test.py | 3 +- ...source_config_compatibility_checks_test.py | 4 +- tests/unit/utils/tensorboard_writer_test.py | 48 ++- 16 files changed, 283 insertions(+), 162 deletions(-) diff --git a/docs/plans/20260505-tb-multi-job-iteration.md b/docs/plans/20260505-tb-multi-job-iteration.md index 20c6a52c5..0639252a0 100644 --- a/docs/plans/20260505-tb-multi-job-iteration.md +++ b/docs/plans/20260505-tb-multi-job-iteration.md @@ -1,93 +1,163 @@ # Multi-Job TensorBoard: Local Iteration & Final Design Plan -Date: 2026-05-05 -Branch: `kmonte/add-tb-for-glt` +Date: 2026-05-05 Branch: `kmonte/add-tb-for-glt` -This plan supersedes the earlier branch plan at 
`docs/plans/20260504-tb-experiment-name-proto.md`. It incorporates findings from two Codex plan reviews — round 1 at `.claude/tmp/codex-verify/20260505-155740-plan-crystalline-giggling-backus/review.md` and round 2 at `.claude/tmp/codex-verify/20260505-161326-plan-crystalline-giggling-backus/review.md`. Round-2 deltas (e.g. uniqueness via timestamp suffix, returning the `CustomJob` from `launch_single_pool_job`, `--container-uri` required, no commit of experiment name into the e2e CORA config) are applied during implementation, not via plan edits. +This plan supersedes the earlier branch plan at `docs/plans/20260504-tb-experiment-name-proto.md`. It incorporates +findings from two Codex plan reviews — round 1 at +`.claude/tmp/codex-verify/20260505-155740-plan-crystalline-giggling-backus/review.md` and round 2 at +`.claude/tmp/codex-verify/20260505-161326-plan-crystalline-giggling-backus/review.md`. Round-2 deltas (e.g. uniqueness +via timestamp suffix, returning the `CustomJob` from `launch_single_pool_job`, `--container-uri` required, no commit of +experiment name into the e2e CORA config) are applied during implementation, not via plan edits. ## Context -Across three full-pipeline iterations on this branch we've cycled through three TB integration designs, each broken in a different way: - -1. **`submit(tensorboard=…)`** — auto-uploader runs, but the destination `TensorboardExperiment` is named after the (numeric) `CustomJob` ID. Per-job page works (R1 ✓), but multiple jobs cannot share one TB page (R2 ✗). -2. **`submit(experiment=…)`** — never streams events. The SDK's `experiment=` is for Vertex AI Experiments parameter/metric tracking; Vertex's TB auto-uploader is gated on `jobSpec.tensorboard` being set, which `experiment=` is mutually exclusive with. Result: events written to `AIP_TENSORBOARD_LOG_DIR` sit in GCS un-uploaded. Job 6570151780682825728 confirmed this empirically. -3. 
**Custom uploader from chief rank, no `tensorboard=`** — events stream to the chosen experiment (R2 ✓), but the VAI job page no longer shows the "Open TensorBoard" link because that link is keyed on `jobSpec.tensorboard` (R1 ✗). Job 4543918976459079680 confirmed this. - -R1 (TB link from job page) and R2 (multi-job comparison) are not mutually exclusive — they just can't be satisfied by a single mechanism. The right approach combines both: server-side auto-uploader for the job-page link, plus a chief-rank uploader for the cross-job comparison experiment, pointing at two different `TensorboardExperiment`s under the same `Tensorboard` instance. Implementation is small; the risk is verifying behavior end-to-end. The fix for that is a tight local iteration loop. +Across three full-pipeline iterations on this branch we've cycled through three TB integration designs, each broken in a +different way: + +1. **`submit(tensorboard=…)`** — auto-uploader runs, but the destination `TensorboardExperiment` is named after the + (numeric) `CustomJob` ID. Per-job page works (R1 ✓), but multiple jobs cannot share one TB page (R2 ✗). +2. **`submit(experiment=…)`** — never streams events. The SDK's `experiment=` is for Vertex AI Experiments + parameter/metric tracking; Vertex's TB auto-uploader is gated on `jobSpec.tensorboard` being set, which `experiment=` + is mutually exclusive with. Result: events written to `AIP_TENSORBOARD_LOG_DIR` sit in GCS un-uploaded. Job + 6570151780682825728 confirmed this empirically. +3. **Custom uploader from chief rank, no `tensorboard=`** — events stream to the chosen experiment (R2 ✓), but the VAI + job page no longer shows the "Open TensorBoard" link because that link is keyed on `jobSpec.tensorboard` (R1 ✗). Job + 4543918976459079680 confirmed this. + +R1 (TB link from job page) and R2 (multi-job comparison) are not mutually exclusive — they just can't be satisfied by a +single mechanism. 
The right approach combines both: server-side auto-uploader for the job-page link, plus a chief-rank +uploader for the cross-job comparison experiment, pointing at two different `TensorboardExperiment`s under the same +`Tensorboard` instance. Implementation is small; the risk is verifying behavior end-to-end. The fix for that is a tight +local iteration loop. ## Success criteria -| ID | Criterion | How verified | -|----|-----------|--------------| -| R1 | The Vertex AI job UI shows "Open TensorBoard" for a successful job, and clicking it loads the per-job experiment with this job's scalar runs. | Manual: open the job in the cloud console; click the link. | -| R2 | Two jobs submitted with the same `tensorboardExperimentName` show **two distinct runs** on one TB page (the user-named experiment), each carrying its own scalars. | Manual: open the named experiment URL; toggle both runs in the scalars dashboard. Smoke script also asserts run count + ≥1 `TensorboardTimeSeries` per run. | -| R3 | Jobs without `tensorboardExperimentName` keep working: events flow to a per-job auto-named experiment. No regression. | Existing `tests/unit/src/common/vertex_ai_test.py::test_submit_job_passes_tensorboard_and_base_output_dir` plus a smoke run with the field unset. | -| R4 | `make unit_test_py` and `make type_check` pass on the branch. | CI / local. | -| R5 (process) | A new dev script lets us submit a tiny CustomJob from a laptop and verify R1+R2 in <2 minutes, end-to-end. | Run it twice; time both invocations. | -| R6 | Trainer process exits cleanly even when training fails — the chief-rank uploader does not hang the worker. | Inspected via the `try/finally` (or `with`) wrapping in all four training entrypoints; `make unit_test_py` covers the writer's idempotent close. 
| +| ID | Criterion | How verified | +| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | +| R1 | The Vertex AI job UI shows "Open TensorBoard" for a successful job, and clicking it loads the per-job experiment with this job's scalar runs. | Manual: open the job in the cloud console; click the link. | +| R2 | Two jobs submitted with the same `tensorboardExperimentName` show **two distinct runs** on one TB page (the user-named experiment), each carrying its own scalars. | Manual: open the named experiment URL; toggle both runs in the scalars dashboard. Smoke script also asserts run count + ≥1 `TensorboardTimeSeries` per run. | +| R3 | Jobs without `tensorboardExperimentName` keep working: events flow to a per-job auto-named experiment. No regression. | Existing `tests/unit/src/common/vertex_ai_test.py::test_submit_job_passes_tensorboard_and_base_output_dir` plus a smoke run with the field unset. | +| R4 | `make unit_test_py` and `make type_check` pass on the branch. | CI / local. | +| R5 (process) | A new dev script lets us submit a tiny CustomJob from a laptop and verify R1+R2 in \<2 minutes, end-to-end. | Run it twice; time both invocations. | +| R6 | Trainer process exits cleanly even when training fails — the chief-rank uploader does not hang the worker. | Inspected via the `try/finally` (or `with`) wrapping in all four training entrypoints; `make unit_test_py` covers the writer's idempotent close. 
| ## Final design -**(A) Set `jobSpec.tensorboard=` on every job that has a TB resource configured (even when an experiment name is also set).** This restores the VAI job-page TB link unconditionally and continues to populate `AIP_TENSORBOARD_RESOURCE_NAME` and `AIP_TENSORBOARD_LOG_DIR` in the worker. Vertex's auto-uploader streams events to a per-job experiment named after the job's numeric ID — that's R1. +**(A) Set `jobSpec.tensorboard=` on every job that has a TB resource configured (even when an experiment name +is also set).** This restores the VAI job-page TB link unconditionally and continues to populate +`AIP_TENSORBOARD_RESOURCE_NAME` and `AIP_TENSORBOARD_LOG_DIR` in the worker. Vertex's auto-uploader streams events to a +per-job experiment named after the job's numeric ID — that's R1. **(B) When `tensorboard_experiment_name` is set, the launcher injects three env vars:** - `GIGL_TENSORBOARD_RESOURCE_NAME` — full Tensorboard resource name (already injected at HEAD). - `GIGL_TENSORBOARD_EXPERIMENT_NAME` — the user-chosen experiment name (already injected at HEAD). -- `GIGL_TENSORBOARD_RUN_NAME` — **new**: derived from the launcher's `job_name`, with `_` → `-` (so the GCS subdir name matches what the SDK's `reformat_run_name` will produce). Codex Issue 1 fix. +- `GIGL_TENSORBOARD_RUN_NAME` — **new**: derived from the launcher's `job_name`, with `_` → `-` (so the GCS subdir name + matches what the SDK's `reformat_run_name` will produce). Codex Issue 1 fix. **(C) `TensorBoardWriter.from_env()` (chief rank only):** -- If `GIGL_TENSORBOARD_RUN_NAME` is set: write events to `//` (a *subdirectory*), not to the parent. This makes the run name visible to both the server-side auto-uploader and our chief-rank uploader as a `relpath` of the parent logdir, instead of the SDK's hardcoded `DEFAULT_RUN_NAME = "default"` (`.venv/lib/python3.11/site-packages/google/cloud/aiplatform/tensorboard/uploader_utils.py:44`). 
Two jobs with different run names → two distinct runs in the named experiment. Codex Issue 1 fix. +- If `GIGL_TENSORBOARD_RUN_NAME` is set: write events to `//` (a *subdirectory*), not + to the parent. This makes the run name visible to both the server-side auto-uploader and our chief-rank uploader as a + `relpath` of the parent logdir, instead of the SDK's hardcoded `DEFAULT_RUN_NAME = "default"` + (`.venv/lib/python3.11/site-packages/google/cloud/aiplatform/tensorboard/uploader_utils.py:44`). Two jobs with + different run names → two distinct runs in the named experiment. Codex Issue 1 fix. - If `GIGL_TENSORBOARD_RUN_NAME` is unset: write to `AIP_TENSORBOARD_LOG_DIR` directly (today's behavior, R3 path). -- If both `GIGL_TENSORBOARD_RESOURCE_NAME` and `GIGL_TENSORBOARD_EXPERIMENT_NAME` are also set, additionally `aiplatform.start_upload_tb_log(tensorboard_id=…, tensorboard_experiment_name=…, logdir=AIP_TENSORBOARD_LOG_DIR)` — the parent logdir, not the subdir, so the uploader's `LogdirLoader` discovers the subdir as a run via `os.path.relpath`. **Do not pass `run_name_prefix`** — the subdir already gives us the run identity, and a non-empty prefix would concatenate awkwardly with the discovered run name. +- If both `GIGL_TENSORBOARD_RESOURCE_NAME` and `GIGL_TENSORBOARD_EXPERIMENT_NAME` are also set, additionally + `aiplatform.start_upload_tb_log(tensorboard_id=…, tensorboard_experiment_name=…, logdir=AIP_TENSORBOARD_LOG_DIR)` — + the parent logdir, not the subdir, so the uploader's `LogdirLoader` discovers the subdir as a run via + `os.path.relpath`. **Do not pass `run_name_prefix`** — the subdir already gives us the run identity, and a non-empty + prefix would concatenate awkwardly with the discovered run name. - `close()` already pairs with `aiplatform.end_upload_tb_log()` (`gigl/utils/tensorboard_writer.py:149`). 
-**(D) Always use `with TensorBoardWriter.from_env(...)` in trainer entrypoints.** The SDK uploader thread is **not** a daemon (`.venv/lib/python3.11/site-packages/google/cloud/aiplatform/tensorboard/uploader_tracker.py:162` — `threading.Thread(...).start()` without `daemon=True`); the SDK's docstring explicitly says to call `end_upload_tb_log()` in `finally` (`uploader_tracker.py:109`). Today's example trainers call `close()` only on the happy path. Codex Issue 3 fix: switch all four trainers to context-manager use. +**(D) Always use `with TensorBoardWriter.from_env(...)` in trainer entrypoints.** The SDK uploader thread is **not** a +daemon (`.venv/lib/python3.11/site-packages/google/cloud/aiplatform/tensorboard/uploader_tracker.py:162` — +`threading.Thread(...).start()` without `daemon=True`); the SDK's docstring explicitly says to call +`end_upload_tb_log()` in `finally` (`uploader_tracker.py:109`). Today's example trainers call `close()` only on the +happy path. Codex Issue 3 fix: switch all four trainers to context-manager use. -The `submit(experiment=…)` SDK path and the `_ensure_experiment_with_backing_tb` helper are not needed for either requirement; both are gone as of HEAD `e19f1050`. +The `submit(experiment=…)` SDK path and the `_ensure_experiment_with_backing_tb` helper are not needed for either +requirement; both are gone as of HEAD `e19f1050`. ## Files to modify -- `gigl/common/services/vertex_ai.py` — `_submit_job`: drop the experiment-name early branch; always set `tensorboard=` whenever `job_config.tensorboard_resource_name` is non-empty. Keep the experiment-name regex validation (fail-fast). Update the `VertexAiJobConfig` docstring around `gigl/common/services/vertex_ai.py:150` (Codex Issue 6). -- `gigl/src/common/vertex_ai_launcher.py` — `_build_job_config`: keep the existing `GIGL_TENSORBOARD_RESOURCE_NAME` / `GIGL_TENSORBOARD_EXPERIMENT_NAME` injection; **add** `GIGL_TENSORBOARD_RUN_NAME` (sanitized job name). 
Update the comment block at `gigl/src/common/vertex_ai_launcher.py:300` describing what `_submit_job` does (Codex Issue 6). -- `gigl/utils/tensorboard_writer.py` — `from_env()` reads `GIGL_TENSORBOARD_RUN_NAME` and uses it as a subdir of `AIP_TENSORBOARD_LOG_DIR` for the `tf.summary.create_file_writer` log_dir; `_maybe_start_uploader` still watches the parent logdir. -- `proto/snapchat/research/gbml/gbml_config.proto:204` — update the `tensorboard_experiment_name` comment to describe the dual-uploader behavior, not the dropped `experiment=`-backed design (Codex Issue 6). Run `make compile_protos` to regenerate Python + Scala stubs. -- `examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml:26` — change `tensorboardExperimentName` from the personal `kmonte-test-experiment` to `homogeneous-link-prediction-comparison` (Codex Issue 5). -- `examples/link_prediction/homogeneous_training.py`, `examples/link_prediction/heterogeneous_training.py`, `examples/link_prediction/graph_store/homogeneous_training.py`, `examples/link_prediction/graph_store/heterogeneous_training.py` — replace the existing `tensorboard_writer = TensorBoardWriter.from_env(...)` + later `.close()` pattern with a `with` block. (Codex Issue 3 + Impact Analysis.) -- `tests/unit/src/common/vertex_ai_test.py` — rename `test_submit_job_skips_experiment_and_tensorboard_when_experiment_name_set` to `test_submit_job_passes_tensorboard_with_or_without_experiment_name` and assert `tensorboard=` is set in both branches. -- `tests/unit/src/common/vertex_ai_launcher_test.py` — assert `GIGL_TENSORBOARD_RUN_NAME` is injected when an experiment name is set; not injected otherwise. -- `tests/unit/utils/tensorboard_writer_test.py` — assert the writer's effective `log_dir` is the subdir (`//`) when `GIGL_TENSORBOARD_RUN_NAME` is set; assert `start_upload_tb_log` is called with `logdir=` (NOT the subdir) and no `run_name_prefix`. -- `python -m gigl.utils.dev.submit_smoke_job` — **new** local iteration tool. 
The `tools/` directory already exists in the repo (Codex correction). +- `gigl/common/services/vertex_ai.py` — `_submit_job`: drop the experiment-name early branch; always set + `tensorboard=` whenever `job_config.tensorboard_resource_name` is non-empty. Keep the experiment-name regex + validation (fail-fast). Update the `VertexAiJobConfig` docstring around `gigl/common/services/vertex_ai.py:150` (Codex + Issue 6). +- `gigl/src/common/vertex_ai_launcher.py` — `_build_job_config`: keep the existing `GIGL_TENSORBOARD_RESOURCE_NAME` / + `GIGL_TENSORBOARD_EXPERIMENT_NAME` injection; **add** `GIGL_TENSORBOARD_RUN_NAME` (sanitized job name). Update the + comment block at `gigl/src/common/vertex_ai_launcher.py:300` describing what `_submit_job` does (Codex Issue 6). +- `gigl/utils/tensorboard_writer.py` — `from_env()` reads `GIGL_TENSORBOARD_RUN_NAME` and uses it as a subdir of + `AIP_TENSORBOARD_LOG_DIR` for the `tf.summary.create_file_writer` log_dir; `_maybe_start_uploader` still watches the + parent logdir. +- `proto/snapchat/research/gbml/gbml_config.proto:204` — update the `tensorboard_experiment_name` comment to describe + the dual-uploader behavior, not the dropped `experiment=`-backed design (Codex Issue 6). Run `make compile_protos` to + regenerate Python + Scala stubs. +- `examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml:26` — change `tensorboardExperimentName` from the + personal `kmonte-test-experiment` to `homogeneous-link-prediction-comparison` (Codex Issue 5). +- `examples/link_prediction/homogeneous_training.py`, `examples/link_prediction/heterogeneous_training.py`, + `examples/link_prediction/graph_store/homogeneous_training.py`, + `examples/link_prediction/graph_store/heterogeneous_training.py` — replace the existing + `tensorboard_writer = TensorBoardWriter.from_env(...)` + later `.close()` pattern with a `with` block. (Codex Issue 3 + \+ Impact Analysis.) 
+- `tests/unit/src/common/vertex_ai_test.py` — rename + `test_submit_job_skips_experiment_and_tensorboard_when_experiment_name_set` to + `test_submit_job_passes_tensorboard_with_or_without_experiment_name` and assert `tensorboard=` is set in both + branches. +- `tests/unit/src/common/vertex_ai_launcher_test.py` — assert `GIGL_TENSORBOARD_RUN_NAME` is injected when an experiment + name is set; not injected otherwise. +- `tests/unit/utils/tensorboard_writer_test.py` — assert the writer's effective `log_dir` is the subdir + (`//`) when `GIGL_TENSORBOARD_RUN_NAME` is set; assert `start_upload_tb_log` is called with + `logdir=` (NOT the subdir) and no `run_name_prefix`. +- `python -m gigl.utils.dev.submit_smoke_job` — **new** local iteration tool. The `tools/` directory already exists in + the repo (Codex correction). ## Local iteration tool -A standalone Python script that bypasses ConfigPopulator and the full pipeline. Goal: <2 min from "I changed code" to "I see whether TB shows up." +A standalone Python script that bypasses ConfigPopulator and the full pipeline. Goal: \<2 min from "I changed code" to +"I see whether TB shows up." Path: `python -m gigl.utils.dev.submit_smoke_job`. What it does: -1. **Use the production launcher path** (`gigl.src.common.vertex_ai_launcher.launch_single_pool_job`) — *not* `VertexAIService.launch_job` directly — so the same `_build_job_config` env-var injection runs as in production. Codex Issue 2 fix. +1. **Use the production launcher path** (`gigl.src.common.vertex_ai_launcher.launch_single_pool_job`) — *not* + `VertexAIService.launch_job` directly — so the same `_build_job_config` env-var injection runs as in production. + Codex Issue 2 fix. 2. Constructs a small `VertexAiResourceConfig` proto inline: - - `machine_type="n1-standard-2"`, `gpu_type="ACCELERATOR_TYPE_UNSPECIFIED"`, `gpu_limit=0`, `num_replicas=1`, `tensorboard_resource_name=`. -3. 
Constructs a small `GiglResourceConfig` proto with that trainer config + `shared_resource_config.common_compute_config` populated from CLI flags. + - `machine_type="n1-standard-2"`, `gpu_type="ACCELERATOR_TYPE_UNSPECIFIED"`, `gpu_limit=0`, `num_replicas=1`, + `tensorboard_resource_name=`. +3. Constructs a small `GiglResourceConfig` proto with that trainer config + + `shared_resource_config.common_compute_config` populated from CLI flags. 4. Calls `launch_single_pool_job(...)` with: - - `process_command="python -m gigl.utils.dev.tb_smoke_main"` — a tiny module added in the same commit; reads env vars, instantiates `TensorBoardWriter.from_env(enabled=True)`, writes 3 scalar events at steps 0/1/2, sleeps ~30s, exits. - - `tensorboard_logs_uri = GcsUri("gs:///tb-smoke//logs/")` — drives `base_output_dir` via the existing helper at `gigl/src/common/vertex_ai_launcher.py:_get_base_output_dir_from_tensorboard_logs_uri`. + - `process_command="python -m gigl.utils.dev.tb_smoke_main"` — a tiny module added in the same commit; reads env + vars, instantiates `TensorBoardWriter.from_env(enabled=True)`, writes 3 scalar events at steps 0/1/2, sleeps ~30s, + exits. + - `tensorboard_logs_uri = GcsUri("gs:///tb-smoke//logs/")` — drives `base_output_dir` via the + existing helper at `gigl/src/common/vertex_ai_launcher.py:_get_base_output_dir_from_tensorboard_logs_uri`. - `tensorboard_experiment_name` from a CLI flag (or `None`). 5. After completion, queries the Vertex AI APIs: - - `aiplatform.TensorboardExperiment.list(tensorboard_name=)` (`tensorboard_resource.py:518`) to confirm both expected experiments exist (the per-job auto-named one always; the user-named one only when the experiment-name flag was passed). - - For each expected run, `aiplatform.TensorboardTimeSeries.list(tensorboard_run_name=)` (`tensorboard_resource.py:1264`) to confirm at least one scalar tag exists. 
Codex Issue 4 fix — `TensorboardRun.list` alone only confirms run *existence*, not that scalars were ingested. + - `aiplatform.TensorboardExperiment.list(tensorboard_name=)` (`tensorboard_resource.py:518`) to confirm + both expected experiments exist (the per-job auto-named one always; the user-named one only when the + experiment-name flag was passed). + - For each expected run, `aiplatform.TensorboardTimeSeries.list(tensorboard_run_name=)` + (`tensorboard_resource.py:1264`) to confirm at least one scalar tag exists. Codex Issue 4 fix — + `TensorboardRun.list` alone only confirms run *existence*, not that scalars were ingested. 6. Prints both TB UI URLs (per-job and named) for manual inspection. -Required CLI flags: `--project`, `--region`, `--service-account`, `--staging-bucket`, `--tensorboard` (full resource name), and optional `--experiment-name`, `--container-uri` (defaults to `DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU` from `gigl/common/constants.py:69`), `--dry-run`. +Required CLI flags: `--project`, `--region`, `--service-account`, `--staging-bucket`, `--tensorboard` (full resource +name), and optional `--experiment-name`, `--container-uri` (defaults to `DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU` from +`gigl/common/constants.py:69`), `--dry-run`. Existing infrastructure to reuse: -- `gigl/src/common/vertex_ai_launcher.py:launch_single_pool_job` — production entry; running through this exercises env-var injection. + +- `gigl/src/common/vertex_ai_launcher.py:launch_single_pool_job` — production entry; running through this exercises + env-var injection. - `gigl/common/services/vertex_ai.py:VertexAiJobConfig` — config dataclass. - `gigl/utils/tensorboard_writer.py:TensorBoardWriter` — same writer the trainers use. -- `aiplatform.TensorboardExperiment.list` / `aiplatform.TensorboardRun.list` / `aiplatform.TensorboardTimeSeries.list` — verification surfaces. 
+- `aiplatform.TensorboardExperiment.list` / `aiplatform.TensorboardRun.list` / `aiplatform.TensorboardTimeSeries.list` — + verification surfaces. - `DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU` from `gigl/common/constants.py:69` — default container image. ## Step-by-step plan @@ -97,13 +167,21 @@ Each step ends with a verification. ### Step 1: revert `_submit_job` to always pass `tensorboard=` and refresh stale comments Production code: -- `gigl/common/services/vertex_ai.py:_submit_job` — set `submit_kwargs["tensorboard"] = job_config.tensorboard_resource_name` whenever `job_config.tensorboard_resource_name` is non-empty, regardless of `tensorboard_experiment_name`. Keep the experiment-name regex validation gate. -- `gigl/common/services/vertex_ai.py:150` — update the `VertexAiJobConfig.tensorboard_experiment_name` docstring to describe "auxiliary chief-rank uploader streams events to this experiment in addition to the per-job auto-named one." + +- `gigl/common/services/vertex_ai.py:_submit_job` — set + `submit_kwargs["tensorboard"] = job_config.tensorboard_resource_name` whenever `job_config.tensorboard_resource_name` + is non-empty, regardless of `tensorboard_experiment_name`. Keep the experiment-name regex validation gate. +- `gigl/common/services/vertex_ai.py:150` — update the `VertexAiJobConfig.tensorboard_experiment_name` docstring to + describe "auxiliary chief-rank uploader streams events to this experiment in addition to the per-job auto-named one." - `gigl/src/common/vertex_ai_launcher.py:300` — update the comment block describing `_submit_job` behavior. -- `proto/snapchat/research/gbml/gbml_config.proto:204` — replace the `experiment=`-backed description with the new dual-uploader description; run `make compile_protos`. +- `proto/snapchat/research/gbml/gbml_config.proto:204` — replace the `experiment=`-backed description with the new + dual-uploader description; run `make compile_protos`. 
Tests: -- `tests/unit/src/common/vertex_ai_test.py` — rename `test_submit_job_skips_experiment_and_tensorboard_when_experiment_name_set` → `test_submit_job_passes_tensorboard_with_or_without_experiment_name`; assert `tensorboard=` is set in both branches. + +- `tests/unit/src/common/vertex_ai_test.py` — rename + `test_submit_job_skips_experiment_and_tensorboard_when_experiment_name_set` → + `test_submit_job_passes_tensorboard_with_or_without_experiment_name`; assert `tensorboard=` is set in both branches. Verify: `make unit_test_py PY_TEST_FILES="vertex_ai_test.py"` passes; `make type_check` is clean. @@ -112,59 +190,91 @@ Commit: `vertex_ai: always pass tensorboard= so VAI job page links to TB`. ### Step 2: inject `GIGL_TENSORBOARD_RUN_NAME` and consume it in the writer Production code: -- `gigl/src/common/vertex_ai_launcher.py:_build_job_config` — when `tensorboard_experiment_name` is set, also append `env_var.EnvVar(name="GIGL_TENSORBOARD_RUN_NAME", value=job_name.replace("_", "-"))` next to the existing two GIGL_TENSORBOARD_* env vars. (We pre-sanitize so the GCS subdir name and the SDK-derived run name agree.) -- `gigl/utils/tensorboard_writer.py:from_env` — if `GIGL_TENSORBOARD_RUN_NAME` is set, compute `effective_log_dir = os.path.join(AIP_TENSORBOARD_LOG_DIR, run_name)` and pass that to `tf.summary.create_file_writer`. Otherwise pass `AIP_TENSORBOARD_LOG_DIR` (today's behavior). -- `gigl/utils/tensorboard_writer.py:_maybe_start_uploader` — keep watching the **parent** `AIP_TENSORBOARD_LOG_DIR` (so the SDK's `LogdirLoader` discovers the run via `os.path.relpath(subdir, logdir)` as the subdir name). No `run_name_prefix`. + +- `gigl/src/common/vertex_ai_launcher.py:_build_job_config` — when `tensorboard_experiment_name` is set, also append + `env_var.EnvVar(name="GIGL_TENSORBOARD_RUN_NAME", value=job_name.replace("_", "-"))` next to the existing two + GIGL_TENSORBOARD\_\* env vars. (We pre-sanitize so the GCS subdir name and the SDK-derived run name agree.) 
+- `gigl/utils/tensorboard_writer.py:from_env` — if `GIGL_TENSORBOARD_RUN_NAME` is set, compute + `effective_log_dir = os.path.join(AIP_TENSORBOARD_LOG_DIR, run_name)` and pass that to + `tf.summary.create_file_writer`. Otherwise pass `AIP_TENSORBOARD_LOG_DIR` (today's behavior). +- `gigl/utils/tensorboard_writer.py:_maybe_start_uploader` — keep watching the **parent** `AIP_TENSORBOARD_LOG_DIR` (so + the SDK's `LogdirLoader` discovers the run via `os.path.relpath(subdir, logdir)` as the subdir name). No + `run_name_prefix`. Tests: -- `tests/unit/src/common/vertex_ai_launcher_test.py` — assert the GIGL_TENSORBOARD_RUN_NAME env var is injected when an experiment name is set; underscores in the job name become hyphens; not injected when experiment name is unset. -- `tests/unit/utils/tensorboard_writer_test.py` — when `GIGL_TENSORBOARD_RUN_NAME=my-run`: assert the writer's underlying file-writer was created for `/my-run/`; assert `start_upload_tb_log` called with `logdir=` and no `run_name_prefix`. When unset: writer uses parent dir directly (regression coverage for R3). -Verify: `make unit_test_py PY_TEST_FILES="vertex_ai_launcher_test.py"`; `make unit_test_py PY_TEST_FILES="tensorboard_writer_test.py"`. +- `tests/unit/src/common/vertex_ai_launcher_test.py` — assert the GIGL_TENSORBOARD_RUN_NAME env var is injected when an + experiment name is set; underscores in the job name become hyphens; not injected when experiment name is unset. +- `tests/unit/utils/tensorboard_writer_test.py` — when `GIGL_TENSORBOARD_RUN_NAME=my-run`: assert the writer's + underlying file-writer was created for `/my-run/`; assert `start_upload_tb_log` called with `logdir=` + and no `run_name_prefix`. When unset: writer uses parent dir directly (regression coverage for R3). + +Verify: `make unit_test_py PY_TEST_FILES="vertex_ai_launcher_test.py"`; +`make unit_test_py PY_TEST_FILES="tensorboard_writer_test.py"`. Commit: `tensorboard: emit unique run names so multi-job comparison shows two runs`. 
### Step 3: harden trainer uploader lifecycle For each of: -- `examples/link_prediction/homogeneous_training.py` (`tensorboard_writer = TensorBoardWriter.from_env(...)` at line 364, `.close()` at line 621) + +- `examples/link_prediction/homogeneous_training.py` (`tensorboard_writer = TensorBoardWriter.from_env(...)` at line + 364, `.close()` at line 621) - `examples/link_prediction/heterogeneous_training.py` - `examples/link_prediction/graph_store/homogeneous_training.py` - `examples/link_prediction/graph_store/heterogeneous_training.py` -Replace the assignment + later `.close()` pattern with `with TensorBoardWriter.from_env(enabled=is_chief_process) as tensorboard_writer:` wrapping the body. The writer already supports `__enter__`/`__exit__`; this just guarantees `end_upload_tb_log` runs even when training raises. +Replace the assignment + later `.close()` pattern with +`with TensorBoardWriter.from_env(enabled=is_chief_process) as tensorboard_writer:` wrapping the body. The writer already +supports `__enter__`/`__exit__`; this just guarantees `end_upload_tb_log` runs even when training raises. -If the writer is used at module scope across many functions (and a single `with` block would force a large diff), wrap the function that owns the training loop in `try/finally` and call `tensorboard_writer.close()` in `finally`. +If the writer is used at module scope across many functions (and a single `with` block would force a large diff), wrap +the function that owns the training loop in `try/finally` and call `tensorboard_writer.close()` in `finally`. -Tests: existing `make unit_test_py PY_TEST_FILES="tensorboard_writer_test.py"` already covers idempotent close. No new unit tests required (these example scripts are not unit-tested today). +Tests: existing `make unit_test_py PY_TEST_FILES="tensorboard_writer_test.py"` already covers idempotent close. No new +unit tests required (these example scripts are not unit-tested today). 
-Verify: `make type_check`; manually re-read each modified entrypoint to confirm the writer's lifetime spans the entire training-loop scope. +Verify: `make type_check`; manually re-read each modified entrypoint to confirm the writer's lifetime spans the entire +training-loop scope. Commit: `examples: scope TensorBoardWriter to a context manager in all training entrypoints`. ### Step 4: write `python -m gigl.utils.dev.submit_smoke_job` + `gigl/utils/dev/tb_smoke_main.py` -- `gigl/utils/dev/tb_smoke_main.py`: new module. ~25 lines. Uses `TensorBoardWriter.from_env(enabled=True)` to write 3 scalar events (`{"smoke/value": float(step)}` at steps 0, 1, 2) inside a `with` block, then `time.sleep(30)` to let both uploaders flush. Module-level entry so it can be invoked with `python -m gigl.utils.dev.tb_smoke_main`. +- `gigl/utils/dev/tb_smoke_main.py`: new module. ~25 lines. Uses `TensorBoardWriter.from_env(enabled=True)` to write 3 + scalar events (`{"smoke/value": float(step)}` at steps 0, 1, 2) inside a `with` block, then `time.sleep(30)` to let + both uploaders flush. Module-level entry so it can be invoked with `python -m gigl.utils.dev.tb_smoke_main`. - `python -m gigl.utils.dev.submit_smoke_job`: new top-level script. - - argparse for `--project`, `--region`, `--service-account`, `--staging-bucket`, `--tensorboard`, optional `--experiment-name`, `--container-uri`, `--dry-run`. - - Builds `VertexAiResourceConfig` and `GiglResourceConfig` protos inline (mirror the patterns in `tests/unit/src/common/vertex_ai_launcher_test.py:_create_gigl_resource_config_with_single_pool_inference` for shape). - - Calls `launch_single_pool_job(... vertex_ai_region=, tensorboard_logs_uri=GcsUri("gs:///tb-smoke//logs/"), tensorboard_experiment_name=)`. + - argparse for `--project`, `--region`, `--service-account`, `--staging-bucket`, `--tensorboard`, optional + `--experiment-name`, `--container-uri`, `--dry-run`. 
+ - Builds `VertexAiResourceConfig` and `GiglResourceConfig` protos inline (mirror the patterns in + `tests/unit/src/common/vertex_ai_launcher_test.py:_create_gigl_resource_config_with_single_pool_inference` for + shape). + - Calls + `launch_single_pool_job(... vertex_ai_region=, tensorboard_logs_uri=GcsUri("gs:///tb-smoke//logs/"), tensorboard_experiment_name=)`. - On `--dry-run`: print the resulting `VertexAiJobConfig` and exit 0. - On real run: wait via `service.launch_job` (synchronous), then poll the verification APIs: - - `aiplatform.TensorboardExperiment.list(tensorboard_name=)` — assert per-job experiment with the job's numeric ID exists; assert user-experiment exists iff flag passed. - - For each expected experiment: `aiplatform.TensorboardRun.list(tensorboard_experiment_name=)` — assert at least one run, and (for `--experiment-name` mode) that the run name matches the sanitized job name. - - For each expected run: `aiplatform.TensorboardTimeSeries.list(tensorboard_run_name=)` — assert at least one time series with at least one tag (Codex Issue 4 fix). + - `aiplatform.TensorboardExperiment.list(tensorboard_name=)` — assert per-job experiment with the job's + numeric ID exists; assert user-experiment exists iff flag passed. + - For each expected experiment: `aiplatform.TensorboardRun.list(tensorboard_experiment_name=)` — + assert at least one run, and (for `--experiment-name` mode) that the run name matches the sanitized job name. + - For each expected run: `aiplatform.TensorboardTimeSeries.list(tensorboard_run_name=)` — assert + at least one time series with at least one tag (Codex Issue 4 fix). - Print both UI URLs. -Verify (offline): `python python -m gigl.utils.dev.submit_smoke_job --dry-run --project=… --region=… --service-account=… --staging-bucket=gs://… --tensorboard=projects/…/tensorboards/… --experiment-name=tb-smoke-multi` prints the `VertexAiJobConfig` and exits 0 without touching GCP. 
+Verify (offline):
+`python -m gigl.utils.dev.submit_smoke_job --dry-run --project=… --region=… --service-account=… --staging-bucket=gs://… --tensorboard=projects/…/tensorboards/… --experiment-name=tb-smoke-multi`
+prints the `VertexAiJobConfig` and exits 0 without touching GCP.
 
 Commit: `tools: add dev_submit_tb_smoke_job + tb_smoke_main for fast TB iteration`.
 
 ### Step 5: smoke-validate R1 + R3 (no experiment name)
 
 Run the smoke script without `--experiment-name`. After completion (≤2 min):
+
 - The Vertex AI job UI for the run shows "Open TensorBoard"; clicking it loads the per-job experiment (R1).
-- The per-job experiment exists with one run named `default` (R3 — no `GIGL_TENSORBOARD_RUN_NAME` injected, the writer falls back to writing to the parent logdir).
+- The per-job experiment exists with one run named `default` (R3 — no `GIGL_TENSORBOARD_RUN_NAME` injected, the writer
+  falls back to writing to the parent logdir).
 - No experiment with the user-named slug exists.
 
 If R3 fails, suspect Step 1's submit-kwargs change. The smoke loop iteration is the diagnostic surface.
 
@@ -172,16 +282,21 @@ If R3 fails, suspect Step 1's submit-kwargs change. The smoke loop iteration is 
 ### Step 6: smoke-validate R1 + R2 (with experiment name)
 
 Run twice with the same flag: `--experiment-name=tb-smoke-multi`. After both complete:
+
 - Both job pages still show working "Open TensorBoard" links (R1).
 - Two per-job experiments exist (one per job, auto-named).
 - The `tb-smoke-multi` experiment exists with **two runs**, named after each sanitized job name.
 - Each of those runs has at least one `TensorboardTimeSeries` for the `smoke/value` tag.
 
-If R2 fails (e.g., one merged run instead of two), suspect Step 2's run-name plumbing — iterate within the smoke loop, not the full pipeline.
+If R2 fails (e.g., one merged run instead of two), suspect Step 2's run-name plumbing — iterate within the smoke loop,
+not the full pipeline.
 
### Step 7: full-pipeline regression test -With R1 + R2 verified at the smoke layer, kick off one real homogeneous training run with `tensorboardExperimentName: "homogeneous-link-prediction-comparison"` (the value updated in Step 1's config edit, Codex Issue 5). Verify: +With R1 + R2 verified at the smoke layer, kick off one real homogeneous training run with +`tensorboardExperimentName: "homogeneous-link-prediction-comparison"` (the value updated in Step 1's config edit, Codex +Issue 5). Verify: + - "Open TensorBoard" link works on the job page (R1). - The named experiment shows the run with all trainer scalar tags (R2). @@ -195,43 +310,53 @@ With R1 + R2 verified at the smoke layer, kick off one real homogeneous training ### Step 0 (close-out, runs after exit-plan-mode): relocate this plan to `docs/plans/` -`mv /home/kmontemayor/.claude/plans/crystalline-giggling-backus.md docs/plans/20260505-tb-multi-job-iteration.md` — and add a note in the new file's header pointing at the supersedence relationship with `docs/plans/20260504-tb-experiment-name-proto.md`. Per CLAUDE.md plan conventions (`CLAUDE.md:252`, Codex Issue 7). +`mv /home/kmontemayor/.claude/plans/crystalline-giggling-backus.md docs/plans/20260505-tb-multi-job-iteration.md` — and +add a note in the new file's header pointing at the supersedence relationship with +`docs/plans/20260504-tb-experiment-name-proto.md`. Per CLAUDE.md plan conventions (`CLAUDE.md:252`, Codex Issue 7). 
## Verification summary -| Step | Type | Cost | What it proves | -|------|------|------|----------------| -| 1, 2 | Unit tests + `type_check` | seconds | Code paths aren't broken; env-var injection + writer subdir wiring correct | -| 3 | Read-through + `type_check` | seconds | Lifecycle hardening compiles | -| 4 | `--dry-run` of smoke script | seconds | Script wires correctly without submitting | -| 5 | One smoke run (no experiment-name) | ~1–2 min | R1 + R3 | -| 6 | Two smoke runs (same experiment-name) | ~3–4 min | R1 + R2 (run identity, scalar ingestion) | -| 7 | One real homogeneous training run | ~5–15 min | Full pipeline + R1 + R2 | +| Step | Type | Cost | What it proves | +| ---- | ------------------------------------- | --------- | -------------------------------------------------------------------------- | +| 1, 2 | Unit tests + `type_check` | seconds | Code paths aren't broken; env-var injection + writer subdir wiring correct | +| 3 | Read-through + `type_check` | seconds | Lifecycle hardening compiles | +| 4 | `--dry-run` of smoke script | seconds | Script wires correctly without submitting | +| 5 | One smoke run (no experiment-name) | ~1–2 min | R1 + R3 | +| 6 | Two smoke runs (same experiment-name) | ~3–4 min | R1 + R2 (run identity, scalar ingestion) | +| 7 | One real homogeneous training run | ~5–15 min | Full pipeline + R1 + R2 | Total budget for design-and-verify: ~30 minutes of cluster time. ## Risks & open questions -- **The chief-rank uploader thread is not a daemon** (`uploader_tracker.py:162`). Process exit will not reap it; `end_upload_tb_log()` MUST be called. Step 3 enforces this via `with` blocks in all four trainer entrypoints. Codex Issue 3 fix — the original plan's claim that "the SDK's uploader thread is daemon" was wrong. -- **Race between two uploaders on the same logdir.** Both uploaders read events from GCS; neither writes. Each maintains its own `LogdirLoader` state. No conflict observed in the SDK source. 
Step 5 + 6 confirm in practice. +- **The chief-rank uploader thread is not a daemon** (`uploader_tracker.py:162`). Process exit will not reap it; + `end_upload_tb_log()` MUST be called. Step 3 enforces this via `with` blocks in all four trainer entrypoints. Codex + Issue 3 fix — the original plan's claim that "the SDK's uploader thread is daemon" was wrong. +- **Race between two uploaders on the same logdir.** Both uploaders read events from GCS; neither writes. Each maintains + its own `LogdirLoader` state. No conflict observed in the SDK source. Step 5 + 6 confirm in practice. - **Quota.** Two uploaders ≈ 2× ingestion request rate per opt-in job. Acceptable; revisit only on 429s. -- **GCS subdir vs logdir parent.** The chief-rank uploader watches `AIP_TENSORBOARD_LOG_DIR` (parent) and discovers the run as the subdir name. The server-side auto-uploader does the same. If we ever switch to writing events directly at the parent (no subdir), R2 collapses back to a single `default` run. Step 2's tests pin both ends. -- **`make compile_protos` regenerates Scala stubs as well.** The proto comment update in Step 1 will create a noisy diff in `scala/...` and `scala_spark35/...`. Acceptable. +- **GCS subdir vs logdir parent.** The chief-rank uploader watches `AIP_TENSORBOARD_LOG_DIR` (parent) and discovers the + run as the subdir name. The server-side auto-uploader does the same. If we ever switch to writing events directly at + the parent (no subdir), R2 collapses back to a single `default` run. Step 2's tests pin both ends. +- **`make compile_protos` regenerates Scala stubs as well.** The proto comment update in Step 1 will create a noisy diff + in `scala/...` and `scala_spark35/...`. Acceptable. ## Roll-back -If Steps 5 or 6 fail and the chief-rank uploader is the cause, set just `tensorboard=` on submit and stop injecting any `GIGL_TENSORBOARD_*` env vars. Falls back to R1-only (per-job TB), losing R2 — back to the state before this branch, with no regression. 
+If Steps 5 or 6 fail and the chief-rank uploader is the cause, set just `tensorboard=` on submit and stop +injecting any `GIGL_TENSORBOARD_*` env vars. Falls back to R1-only (per-job TB), losing R2 — back to the state before +this branch, with no regression. ## Codex review traceability Issues 1–7 from `.claude/tmp/codex-verify/20260505-155740-plan-crystalline-giggling-backus/review.md`: -| Issue | Severity | Addressed in | -|-------|----------|--------------| -| 1 — Run identity collapse | High | Step 2 (subdir-based run names, no `run_name_prefix`) | -| 2 — Smoke script bypasses env injection | High | Step 4 (smoke script uses `launch_single_pool_job`) | -| 3 — Uploader thread not daemon | High | Step 3 (`with` wrapping in all four trainers) | -| 4 — TimeSeries verification | Medium | Step 4 (smoke script asserts `TensorboardTimeSeries.list`) | -| 5 — Wrong experiment-name in Step 5 | Medium | Step 1 (config update from `kmonte-test-experiment` → `homogeneous-link-prediction-comparison`) | -| 6 — Stale comments / proto doc | Low | Step 1 (vertex_ai.py:150, vertex_ai_launcher.py:300, gbml_config.proto:204) | -| 7 — Plan-file location convention | Low | Step 0 (move to `docs/plans/20260505-tb-multi-job-iteration.md`) | +| Issue | Severity | Addressed in | +| --------------------------------------- | -------- | ----------------------------------------------------------------------------------------------- | +| 1 — Run identity collapse | High | Step 2 (subdir-based run names, no `run_name_prefix`) | +| 2 — Smoke script bypasses env injection | High | Step 4 (smoke script uses `launch_single_pool_job`) | +| 3 — Uploader thread not daemon | High | Step 3 (`with` wrapping in all four trainers) | +| 4 — TimeSeries verification | Medium | Step 4 (smoke script asserts `TensorboardTimeSeries.list`) | +| 5 — Wrong experiment-name in Step 5 | Medium | Step 1 (config update from `kmonte-test-experiment` → `homogeneous-link-prediction-comparison`) | +| 6 — Stale comments / 
proto doc | Low | Step 1 (vertex_ai.py:150, vertex_ai_launcher.py:300, gbml_config.proto:204) | +| 7 — Plan-file location convention | Low | Step 0 (move to `docs/plans/20260505-tb-multi-job-iteration.md`) | diff --git a/examples/link_prediction/README.md b/examples/link_prediction/README.md index f9f557caf..9c1a5ba9b 100644 --- a/examples/link_prediction/README.md +++ b/examples/link_prediction/README.md @@ -25,14 +25,11 @@ through running each component: `config_populator` -> `data_preprocessor` -> `tr ## Vertex AI TensorBoard -The example trainer configs enable TensorBoard logging with -`trainerConfig.shouldLogToTensorboard: true`. - -To surface those events in Vertex AI TensorBoard, set -`tensorboard_resource_name` on the trainer Vertex resource config, use a -regional bucket, and keep the bucket, CustomJob, and TensorBoard instance in -the same region. The attached service account should have -`roles/storage.admin` and `roles/aiplatform.user`. +The example trainer configs enable TensorBoard logging with `trainerConfig.shouldLogToTensorboard: true`. + +To surface those events in Vertex AI TensorBoard, set `tensorboard_resource_name` on the trainer Vertex resource config, +use a regional bucket, and keep the bucket, CustomJob, and TensorBoard instance in the same region. The attached service +account should have `roles/storage.admin` and `roles/aiplatform.user`. 
```{toctree} :maxdepth: 2 diff --git a/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml b/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml index d552a6aed..e8716ae85 100644 --- a/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml +++ b/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml @@ -18,12 +18,12 @@ trainerConfig: log_every_n_batch: "50" # Frequency in which we log batch information num_neighbors: "[10, 10]" # Fanout per hop, specified as a string representation of a list for the homogeneous use case command: python -m examples.link_prediction.homogeneous_training - # Optional. When set, the trainer's CustomJob is submitted as a run of the - # named Vertex AI Experiment. Multiple jobs sharing this name appear as - # comparable runs on a single TensorBoard page. Requires - # GiglResourceConfig.trainerResourceConfig...tensorboardResourceName to be - # set. See proto/snapchat/research/gbml/gbml_config.proto for details. - tensorboardExperimentName: "kmonte-test-experiment" + # To enable cross-job TensorBoard comparison, override + # ``trainerConfig.tensorboardExperimentName`` in your own task config and + # configure ``GiglResourceConfig.trainerResourceConfig...tensorboardResourceName``. + # Left unset here so the default e2e CORA test stays compatible with + # resource configs that don't include a TensorBoard instance. See + # ``proto/snapchat/research/gbml/gbml_config.proto`` for details. 
inferencerConfig: inferencerArgs: # Example argument to inferencer diff --git a/examples/link_prediction/graph_store/heterogeneous_training.py b/examples/link_prediction/graph_store/heterogeneous_training.py index e8cb74d0c..5c6019973 100644 --- a/examples/link_prediction/graph_store/heterogeneous_training.py +++ b/examples/link_prediction/graph_store/heterogeneous_training.py @@ -506,7 +506,9 @@ def _training_process( ) val_main_loader_iter = InfiniteIterator(val_main_loader) - val_random_negative_loader_iter = InfiniteIterator(val_random_negative_loader) + val_random_negative_loader_iter = InfiniteIterator( + val_random_negative_loader + ) model = init_example_gigl_heterogeneous_model( node_type_to_feature_dim=args.node_type_to_feature_dim, diff --git a/examples/link_prediction/graph_store/homogeneous_training.py b/examples/link_prediction/graph_store/homogeneous_training.py index 1e658315a..e77039fc3 100644 --- a/examples/link_prediction/graph_store/homogeneous_training.py +++ b/examples/link_prediction/graph_store/homogeneous_training.py @@ -495,7 +495,9 @@ def _training_process( ) val_main_loader_iter = InfiniteIterator(val_main_loader) - val_random_negative_loader_iter = InfiniteIterator(val_random_negative_loader) + val_random_negative_loader_iter = InfiniteIterator( + val_random_negative_loader + ) model = init_example_gigl_homogeneous_model( node_feature_dim=args.node_feature_dim, diff --git a/examples/link_prediction/heterogeneous_training.py b/examples/link_prediction/heterogeneous_training.py index 6a97f1875..23b7f0f17 100644 --- a/examples/link_prediction/heterogeneous_training.py +++ b/examples/link_prediction/heterogeneous_training.py @@ -449,7 +449,9 @@ def _training_process( # We keep track of both the dataloader and the iterator for it # so we can clean up resources from the dataloader later. 
val_main_loader_iter = InfiniteIterator(val_main_loader) - val_random_negative_loader_iter = InfiniteIterator(val_random_negative_loader) + val_random_negative_loader_iter = InfiniteIterator( + val_random_negative_loader + ) model = init_example_gigl_heterogeneous_model( node_type_to_feature_dim=args.node_type_to_feature_dim, edge_type_to_feature_dim=args.edge_type_to_feature_dim, @@ -651,7 +653,11 @@ def _training_process( # These get written to some JSON uder the gcs:////trainer/trainer_eval_metrics.json # And then the "Log Trainer Eval Metrics" component in the KFP pipeline UI will log them to the UI, # as a metrics artifact. - if args.machine_rank == 0 and local_rank == 0 and args.eval_metrics_uri is not None: + if ( + args.machine_rank == 0 + and local_rank == 0 + and args.eval_metrics_uri is not None + ): eval_metrics = EvalMetricsCollection( metrics=[ EvalMetric.from_eval_metric_type( diff --git a/examples/link_prediction/homogeneous_training.py b/examples/link_prediction/homogeneous_training.py index c7772f9ec..cf0cee582 100644 --- a/examples/link_prediction/homogeneous_training.py +++ b/examples/link_prediction/homogeneous_training.py @@ -406,7 +406,9 @@ def _training_process( # We keep track of both the dataloader and the iterator for it # so we can clean up resources from the dataloader later. val_main_loader_iter = InfiniteIterator(val_main_loader) - val_random_negative_loader_iter = InfiniteIterator(val_random_negative_loader) + val_random_negative_loader_iter = InfiniteIterator( + val_random_negative_loader + ) model = init_example_gigl_homogeneous_model( node_feature_dim=args.node_feature_dim, @@ -604,7 +606,11 @@ def _training_process( # These get written to some JSON under the gcs:////trainer/trainer_eval_metrics.json # And then the "Log Trainer Eval Metrics" component in the KFP pipeline UI will log them to the UI, # as a metrics artifact. 
- if args.machine_rank == 0 and local_rank == 0 and args.eval_metrics_uri is not None: + if ( + args.machine_rank == 0 + and local_rank == 0 + and args.eval_metrics_uri is not None + ): eval_metrics = EvalMetricsCollection( metrics=[ EvalMetric.from_eval_metric_type( diff --git a/gigl/src/training/v1/trainer.py b/gigl/src/training/v1/trainer.py index 2ecd89556..9d3655e98 100644 --- a/gigl/src/training/v1/trainer.py +++ b/gigl/src/training/v1/trainer.py @@ -56,7 +56,8 @@ def run( else None ) tensorboard_experiment_name = ( - gbml_config_pb_wrapper.trainer_config.tensorboard_experiment_name or None + gbml_config_pb_wrapper.trainer_config.tensorboard_experiment_name + or None ) launch_single_pool_job( vertex_ai_resource_config=trainer_config, diff --git a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py index 9920196d3..ff124615a 100644 --- a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py +++ b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py @@ -130,9 +130,7 @@ def check_vertex_ai_trainer_tensorboard_compatibility( elif isinstance( trainer_resource_config, gigl_resource_config_pb2.VertexAiGraphStoreConfig ): - tb_resource = ( - trainer_resource_config.compute_pool.tensorboard_resource_name - ) + tb_resource = trainer_resource_config.compute_pool.tensorboard_resource_name else: tb_resource = "" assert tb_resource, ( diff --git a/gigl/utils/dev/submit_smoke_job.py b/gigl/utils/dev/submit_smoke_job.py index 69624d9f0..8aead9f3b 100644 --- a/gigl/utils/dev/submit_smoke_job.py +++ b/gigl/utils/dev/submit_smoke_job.py @@ -150,9 +150,7 @@ def _verify_per_job_experiment( job_id: str, ) -> None: """The auto-uploader names its TensorboardExperiment after the job's numeric ID.""" - experiment_resource_name = ( - f"{tensorboard_resource_name}/experiments/{job_id}" - ) + experiment_resource_name = 
f"{tensorboard_resource_name}/experiments/{job_id}" runs = aiplatform.TensorboardRun.list( tensorboard_experiment_name=experiment_resource_name, ) @@ -217,9 +215,7 @@ def _print_tb_urls( experiment_name: Optional[str], ) -> None: base = f"https://{region}.tensorboard.googleusercontent.com/experiment" - qualifier = ( - f"projects+{project}+locations+{region}+tensorboards+{tensorboard_id}" - ) + qualifier = f"projects+{project}+locations+{region}+tensorboards+{tensorboard_id}" per_job = f"{base}/{qualifier}+experiments+{job_id}" logger.info(f"Per-job TB URL: {per_job}") if experiment_name: diff --git a/tests/unit/src/common/vertex_ai_launcher_test.py b/tests/unit/src/common/vertex_ai_launcher_test.py index b71c06889..5409951e6 100644 --- a/tests/unit/src/common/vertex_ai_launcher_test.py +++ b/tests/unit/src/common/vertex_ai_launcher_test.py @@ -336,7 +336,6 @@ def test_launch_inference_single_pool_cpu(self, mock_vertex_ai_service_class): } self.assertEqual(job_config.labels, expected_labels) - @patch("gigl.src.common.vertex_ai_launcher.VertexAIService") def test_launch_single_pool_job_threads_experiment_name( self, mock_vertex_ai_service_class @@ -523,9 +522,7 @@ def test_build_job_config_injects_gigl_tensorboard_env_vars(self) -> None: # GIGL_TENSORBOARD_RUN_NAME must be sanitized (underscores in the # job_name become hyphens) and carry a launch-unique timestamp suffix. 
run_name = env["GIGL_TENSORBOARD_RUN_NAME"] - self.assertRegex( - run_name, r"^gigl-train-some-task-\d{8}-\d{6}$" - ) + self.assertRegex(run_name, r"^gigl-train-some-task-\d{8}-\d{6}$") def test_build_job_config_run_name_is_unique_per_call(self) -> None: """Two builds of the same job_name produce two distinct run names.""" diff --git a/tests/unit/src/common/vertex_ai_test.py b/tests/unit/src/common/vertex_ai_test.py index a811cebb4..470005dac 100644 --- a/tests/unit/src/common/vertex_ai_test.py +++ b/tests/unit/src/common/vertex_ai_test.py @@ -65,7 +65,6 @@ def test_submit_job_passes_tensorboard_and_base_output_dir( ) self.assertNotIn("experiment", submit_kwargs) - def test_vertex_ai_job_config_carries_experiment_name(self) -> None: cfg = VertexAiJobConfig( job_name="job", diff --git a/tests/unit/src/training/glt_trainer_test.py b/tests/unit/src/training/glt_trainer_test.py index a6190adec..f77f2847e 100644 --- a/tests/unit/src/training/glt_trainer_test.py +++ b/tests/unit/src/training/glt_trainer_test.py @@ -5,8 +5,7 @@ from gigl.common import UriFactory from gigl.src.common.types import AppliedTaskIdentifier from gigl.src.training.v2.glt_trainer import GLTTrainer -from snapchat.research.gbml import gbml_config_pb2 -from snapchat.research.gbml import gigl_resource_config_pb2 +from snapchat.research.gbml import gbml_config_pb2, gigl_resource_config_pb2 from tests.test_assets.test_case import TestCase diff --git a/tests/unit/src/training/v1_trainer_test.py b/tests/unit/src/training/v1_trainer_test.py index f253c7dfa..70d3adb95 100644 --- a/tests/unit/src/training/v1_trainer_test.py +++ b/tests/unit/src/training/v1_trainer_test.py @@ -5,8 +5,7 @@ from gigl.common import UriFactory from gigl.src.common.types import AppliedTaskIdentifier from gigl.src.training.v1.trainer import Trainer -from snapchat.research.gbml import gbml_config_pb2 -from snapchat.research.gbml import gigl_resource_config_pb2 +from snapchat.research.gbml import gbml_config_pb2, 
gigl_resource_config_pb2 from tests.test_assets.test_case import TestCase diff --git a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py index a09fd2b86..e0bcc44e8 100644 --- a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py +++ b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py @@ -328,7 +328,9 @@ def test_experiment_name_set_with_tensorboard_resource_does_not_raise(self): resource_config_wrapper=resource_config, ) - def test_experiment_name_set_with_graph_store_tensorboard_resource_does_not_raise(self): + def test_experiment_name_set_with_graph_store_tensorboard_resource_does_not_raise( + self, + ): """tensorboard_experiment_name set and graph-store TB resource present → no exception.""" gbml_config = _create_gbml_config_with_tensorboard_experiment_name( experiment_name="my-comparison" diff --git a/tests/unit/utils/tensorboard_writer_test.py b/tests/unit/utils/tensorboard_writer_test.py index 27bef083b..a71173549 100644 --- a/tests/unit/utils/tensorboard_writer_test.py +++ b/tests/unit/utils/tensorboard_writer_test.py @@ -152,22 +152,16 @@ def test_uploader_starts_when_all_env_vars_present(self) -> None: with patch( "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" ) as mock_create_file_writer: - with patch( - "google.cloud.aiplatform.start_upload_tb_log" - ) as mock_start, patch( - "google.cloud.aiplatform.init" - ) as mock_init, patch( - "google.cloud.aiplatform.end_upload_tb_log" - ) as mock_end: + with ( + patch("google.cloud.aiplatform.start_upload_tb_log") as mock_start, + patch("google.cloud.aiplatform.init") as mock_init, + patch("google.cloud.aiplatform.end_upload_tb_log") as mock_end, + ): writer = TensorBoardWriter.from_env() writer.close() - mock_create_file_writer.assert_called_once_with( - f"{self._LOG_DIR}/my-run" - ) - 
mock_init.assert_called_once_with( - project="my-project", location="us-central1" - ) + mock_create_file_writer.assert_called_once_with(f"{self._LOG_DIR}/my-run") + mock_init.assert_called_once_with(project="my-project", location="us-central1") # Uploader watches the PARENT log dir so the run-name subdir is # discovered as a TensorboardRun via os.path.relpath. mock_start.assert_called_once_with( @@ -183,14 +177,11 @@ def test_uploader_does_not_start_when_only_log_dir_set(self) -> None: {"AIP_TENSORBOARD_LOG_DIR": self._LOG_DIR}, clear=True, ): - with patch( - "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" - ): - with patch( - "google.cloud.aiplatform.start_upload_tb_log" - ) as mock_start, patch( - "google.cloud.aiplatform.end_upload_tb_log" - ) as mock_end: + with patch("gigl.utils.tensorboard_writer.tf.summary.create_file_writer"): + with ( + patch("google.cloud.aiplatform.start_upload_tb_log") as mock_start, + patch("google.cloud.aiplatform.end_upload_tb_log") as mock_end, + ): writer = TensorBoardWriter.from_env() writer.close() @@ -207,9 +198,7 @@ def test_invalid_tb_resource_name_raises(self) -> None: }, clear=True, ): - with patch( - "gigl.utils.tensorboard_writer.tf.summary.create_file_writer" - ): + with patch("gigl.utils.tensorboard_writer.tf.summary.create_file_writer"): with self.assertRaises(ValueError) as ctx: TensorBoardWriter.from_env() @@ -251,10 +240,13 @@ def test_uploader_failure_after_writer_construction_closes_writer(self) -> None: "gigl.utils.tensorboard_writer.tf.summary.create_file_writer", return_value=underlying_writer, ): - with patch( - "google.cloud.aiplatform.start_upload_tb_log", - side_effect=RuntimeError("boom"), - ), patch("google.cloud.aiplatform.init"): + with ( + patch( + "google.cloud.aiplatform.start_upload_tb_log", + side_effect=RuntimeError("boom"), + ), + patch("google.cloud.aiplatform.init"), + ): with self.assertRaises(RuntimeError): TensorBoardWriter.from_env() From 
dd88f7c318bba11f708813cd6ea6da3cc612b4f9 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 21:39:49 +0000 Subject: [PATCH 43/59] dev: remove gigl/utils/dev/ smoke tooling The smoke launcher and tb_smoke_main were used to validate the multi-run TB design end-to-end during this branch's development. Now that R1 + R2 are validated and the production trainers carry the same behavior, drop the dev tooling so it doesn't ship in the public package. The plan at docs/plans/20260505-tb-multi-job-iteration.md retains a record of the iteration loop for posterity. --- gigl/utils/dev/__init__.py | 5 - gigl/utils/dev/submit_smoke_job.py | 314 ----------------------------- gigl/utils/dev/tb_smoke_main.py | 46 ----- 3 files changed, 365 deletions(-) delete mode 100644 gigl/utils/dev/__init__.py delete mode 100644 gigl/utils/dev/submit_smoke_job.py delete mode 100644 gigl/utils/dev/tb_smoke_main.py diff --git a/gigl/utils/dev/__init__.py b/gigl/utils/dev/__init__.py deleted file mode 100644 index 9c1bf25ab..000000000 --- a/gigl/utils/dev/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Developer utilities (smoke entrypoints, ad-hoc test helpers). - -Modules under this package are intended for short, ad-hoc test jobs and -developer iteration. They are NOT part of GiGL's stable public API. -""" diff --git a/gigl/utils/dev/submit_smoke_job.py b/gigl/utils/dev/submit_smoke_job.py deleted file mode 100644 index 8aead9f3b..000000000 --- a/gigl/utils/dev/submit_smoke_job.py +++ /dev/null @@ -1,314 +0,0 @@ -"""Submit a tiny Vertex AI CustomJob that exercises GiGL's TensorBoard wiring. - -Goal: <2 min from "I changed launcher / writer code" to "I see whether TB -shows up." Bypasses ConfigPopulator and the full pipeline; uses the -production launcher path (``launch_single_pool_job``) so the same env-var -injection and submit logic runs as in real training. - -Required CLI flags: - --project GCP project (e.g. ``snap-umap-dev``). - --region Vertex AI region (e.g. ``us-central1``). 
- --service-account Service account email used by the CustomJob. - --staging-bucket Regional GCS bucket Vertex stages artifacts under. - --tensorboard Full TensorBoard resource name - (``projects/.../locations/.../tensorboards/...``). - --container-uri Container image to use. REQUIRED — must contain the - branch under test. Pointing at a released image - would test stale code; codex review explicitly - flagged defaulting to ``DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU`` - as wrong (round-2 issue 2). - -Optional: - --experiment-name Vertex AI ``TensorboardExperiment`` name. Leave - unset to test the per-job auto-upload path (R3). - Set to opt into multi-job comparison (R1+R2). - --job-name CustomJob display name. Defaults to a timestamped - ``gigl-tb-smoke-...``. - --dry-run Print the constructed VertexAiJobConfig and exit - without submitting. - -Verification: - On real (non-dry-run) submission, after the CustomJob completes the - script polls the TensorBoard API surfaces and asserts: - - - The per-job ``TensorboardExperiment`` (named after the CustomJob's - numeric ID) exists, has a run, and that run has at least one - ``TensorboardTimeSeries`` for the ``smoke/value`` tag. - - When ``--experiment-name`` was passed, the user-named experiment also - exists with a run named after the launch-unique ``GIGL_TENSORBOARD_RUN_NAME``, - and that run has at least one time series. - - Both TB UI URLs are printed for manual inspection. 
-""" - -from __future__ import annotations - -import argparse -import datetime -import re -import sys -import time -from typing import Optional - -from google.cloud import aiplatform - -from gigl.common import GcsUri, Uri -from gigl.common.logger import Logger -from gigl.src.common.constants.components import GiGLComponents -from gigl.src.common.types.pb_wrappers.gigl_resource_config import ( - GiglResourceConfigWrapper, -) -from gigl.src.common.vertex_ai_launcher import launch_single_pool_job -from snapchat.research.gbml import gigl_resource_config_pb2 - -logger = Logger() - -_TENSORBOARD_RESOURCE_NAME_PATTERN = re.compile( - r"^projects/(?P[^/]+)" - r"/locations/(?P[^/]+)" - r"/tensorboards/(?P[^/]+)$" -) - - -def _parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--project", required=True) - parser.add_argument("--region", required=True) - parser.add_argument("--service-account", required=True) - parser.add_argument( - "--staging-bucket", - required=True, - help="Regional GCS bucket (e.g. gs://gigl-dev-temp-assets).", - ) - parser.add_argument( - "--tensorboard", - required=True, - help="Full TensorBoard resource name.", - ) - parser.add_argument( - "--container-uri", - required=True, - help=( - "Container image with the branch code. Required; pointing at a " - "released image would test stale code." 
- ), - ) - parser.add_argument("--experiment-name", default=None) - parser.add_argument("--job-name", default=None) - parser.add_argument("--dry-run", action="store_true") - return parser.parse_args() - - -def _build_resource_config( - *, - project: str, - region: str, - service_account: str, - staging_bucket: str, - tensorboard_resource_name: str, -) -> gigl_resource_config_pb2.GiglResourceConfig: - """Minimal GiglResourceConfig wired for a 1-replica CPU CustomJob.""" - common = gigl_resource_config_pb2.SharedResourceConfig.CommonComputeConfig( - project=project, - region=region, - # The launcher reads ``temp_regional_assets_bucket`` as the Vertex - # AI staging bucket (see VertexAIService construction in - # gigl/src/common/vertex_ai_launcher.py). - temp_regional_assets_bucket=staging_bucket, - temp_assets_bucket=staging_bucket, - perm_assets_bucket=staging_bucket, - temp_assets_bq_dataset_name="not_used_by_smoke", - embedding_bq_dataset_name="not_used_by_smoke", - gcp_service_account_email=service_account, - dataflow_runner="DataflowRunner", - ) - shared = gigl_resource_config_pb2.SharedResourceConfig( - common_compute_config=common, - resource_labels={"cost_resource_group": "gigl_dev_smoke"}, - ) - trainer = gigl_resource_config_pb2.VertexAiResourceConfig( - # n1-standard-2 is rejected by Vertex AI training in this project; - # n1-standard-16 is the smallest spec we've confirmed accepted. 
- machine_type="n1-standard-16", - gpu_type="ACCELERATOR_TYPE_UNSPECIFIED", - gpu_limit=0, - num_replicas=1, - timeout=600, - tensorboard_resource_name=tensorboard_resource_name, - ) - return gigl_resource_config_pb2.GiglResourceConfig( - shared_resource_config=shared, - trainer_resource_config=gigl_resource_config_pb2.TrainerResourceConfig( - vertex_ai_trainer_config=trainer, - ), - ) - - -def _verify_per_job_experiment( - *, - tensorboard_resource_name: str, - job_id: str, -) -> None: - """The auto-uploader names its TensorboardExperiment after the job's numeric ID.""" - experiment_resource_name = f"{tensorboard_resource_name}/experiments/{job_id}" - runs = aiplatform.TensorboardRun.list( - tensorboard_experiment_name=experiment_resource_name, - ) - if not runs: - raise RuntimeError( - f"Per-job TensorboardExperiment {experiment_resource_name} has no " - "TensorboardRuns; the auto-uploader did not ingest any events." - ) - for run in runs: - time_series = aiplatform.TensorboardTimeSeries.list( - tensorboard_run_name=run.resource_name, - ) - if not time_series: - raise RuntimeError( - f"Run {run.resource_name} has no TensorboardTimeSeries; " - "events did not reach the API." - ) - logger.info( - f"Per-job experiment OK: {len(runs)} run(s) under {experiment_resource_name}" - ) - - -def _verify_named_experiment( - *, - tensorboard_resource_name: str, - experiment_name: str, -) -> None: - """The chief-rank uploader names its TensorboardExperiment after the user flag.""" - experiment_resource_name = ( - f"{tensorboard_resource_name}/experiments/{experiment_name}" - ) - runs = aiplatform.TensorboardRun.list( - tensorboard_experiment_name=experiment_resource_name, - ) - if not runs: - raise RuntimeError( - f"Named TensorboardExperiment {experiment_resource_name} has no " - "TensorboardRuns; the chief-rank uploader did not ingest events." 
- ) - for run in runs: - time_series = aiplatform.TensorboardTimeSeries.list( - tensorboard_run_name=run.resource_name, - ) - if not time_series: - raise RuntimeError( - f"Run {run.resource_name} has no TensorboardTimeSeries; " - "events did not reach the API." - ) - run_names = sorted(r.display_name for r in runs) - logger.info( - f"Named experiment OK: {len(runs)} run(s) under {experiment_resource_name}: " - f"{run_names}" - ) - - -def _print_tb_urls( - *, - region: str, - project: str, - tensorboard_id: str, - job_id: str, - experiment_name: Optional[str], -) -> None: - base = f"https://{region}.tensorboard.googleusercontent.com/experiment" - qualifier = f"projects+{project}+locations+{region}+tensorboards+{tensorboard_id}" - per_job = f"{base}/{qualifier}+experiments+{job_id}" - logger.info(f"Per-job TB URL: {per_job}") - if experiment_name: - named = f"{base}/{qualifier}+experiments+{experiment_name}" - logger.info(f"Named TB URL: {named}") - - -def main() -> int: - args = _parse_args() - - tb_match = _TENSORBOARD_RESOURCE_NAME_PATTERN.match(args.tensorboard) - if not tb_match: - logger.error( - f"--tensorboard must be projects/.../locations/.../tensorboards/...; " - f"got {args.tensorboard!r}." 
- ) - return 2 - - timestamp = datetime.datetime.utcnow().strftime("%Y%m%d-%H%M%S") - job_name = args.job_name or f"gigl-tb-smoke-{timestamp}" - tensorboard_logs_uri = GcsUri( - f"{args.staging_bucket.rstrip('/')}/tb-smoke/{timestamp}/logs/" - ) - - resource_config = _build_resource_config( - project=args.project, - region=args.region, - service_account=args.service_account, - staging_bucket=args.staging_bucket, - tensorboard_resource_name=args.tensorboard, - ) - resource_wrapper = GiglResourceConfigWrapper(resource_config=resource_config) - - if args.dry_run: - logger.info( - "Dry run — would submit a CustomJob with:\n" - f" job_name = {job_name}\n" - f" container_uri = {args.container_uri}\n" - f" tensorboard_logs_uri = {tensorboard_logs_uri}\n" - f" tensorboard_resource = {args.tensorboard}\n" - f" experiment_name = {args.experiment_name!r}\n" - ) - return 0 - - aiplatform.init(project=args.project, location=args.region) - custom_job = launch_single_pool_job( - vertex_ai_resource_config=resource_config.trainer_resource_config.vertex_ai_trainer_config, - job_name=job_name, - task_config_uri=Uri("gs://unused/by/smoke.yaml"), - resource_config_uri=Uri("gs://unused/by/smoke.yaml"), - process_command="python -m gigl.utils.dev.tb_smoke_main", - process_runtime_args={}, - resource_config_wrapper=resource_wrapper, - cpu_docker_uri=args.container_uri, - cuda_docker_uri=args.container_uri, - component=GiGLComponents.Trainer, - vertex_ai_region=args.region, - tensorboard_logs_uri=tensorboard_logs_uri, - tensorboard_experiment_name=args.experiment_name, - ) - job_id = custom_job.name # trailing segment of resource_name == numeric job ID - logger.info(f"Submitted CustomJob: {custom_job.resource_name}") - logger.info( - f"Job UI: https://console.cloud.google.com/ai/platform/locations/" - f"{args.region}/training/{job_id}?project={args.project}" - ) - - # CustomJob.submit blocks until completion in this code path (see - # VertexAIService._submit_job: job.wait_for_completion). 
Give the - # uploader thread a brief grace period in case the trainer's sleep - # was tight. - time.sleep(5) - - _verify_per_job_experiment( - tensorboard_resource_name=args.tensorboard, - job_id=job_id, - ) - if args.experiment_name: - _verify_named_experiment( - tensorboard_resource_name=args.tensorboard, - experiment_name=args.experiment_name, - ) - - _print_tb_urls( - region=args.region, - project=args.project, - tensorboard_id=tb_match["tensorboard_id"], - job_id=job_id, - experiment_name=args.experiment_name, - ) - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/gigl/utils/dev/tb_smoke_main.py b/gigl/utils/dev/tb_smoke_main.py deleted file mode 100644 index 400b1e0c2..000000000 --- a/gigl/utils/dev/tb_smoke_main.py +++ /dev/null @@ -1,46 +0,0 @@ -"""Tiny smoke-test entrypoint that exercises GiGL's TensorBoard pipeline. - -Submitted as the container command by ``tools/dev_submit_tb_smoke_job.py``. -On the chief rank, instantiates :class:`gigl.utils.tensorboard_writer.TensorBoardWriter` -via ``from_env``, writes a few scalar events, and sleeps long enough for both -TensorBoard uploaders (Vertex's built-in auto-uploader and our chief-rank -``aiplatform.start_upload_tb_log``) to flush before exit. - -Usage: - - python -m gigl.utils.dev.tb_smoke_main - -Reads no CLI flags. All configuration comes from env vars set by Vertex AI -and GiGL's launcher (``AIP_TENSORBOARD_LOG_DIR``, ``GIGL_TENSORBOARD_*``). 
-""" - -from __future__ import annotations - -import time - -from gigl.common.logger import Logger -from gigl.utils.tensorboard_writer import TensorBoardWriter - -logger = Logger() - -_NUM_STEPS = 3 -_FLUSH_SLEEP_SECS = 60 - - -def main() -> None: - """Write a handful of scalar events and wait for the uploaders to flush.""" - logger.info("Starting tb_smoke_main") - with TensorBoardWriter.from_env(enabled=True) as writer: - for step in range(_NUM_STEPS): - writer.log({"smoke/value": float(step)}, step=step) - logger.info(f"Wrote smoke/value={step} at step {step}") - logger.info( - f"Sleeping {_FLUSH_SLEEP_SECS}s to let TensorBoard uploaders flush " - "events to GCS + Vertex AI" - ) - time.sleep(_FLUSH_SLEEP_SECS) - logger.info("tb_smoke_main complete") - - -if __name__ == "__main__": - main() From e5d0cf9fc058ae05027578903f0bce591a748d96 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 22:55:58 +0000 Subject: [PATCH 44/59] docs: remove in-flight branch plan from git The plan was load-bearing during dev; the final state of the branch is the source of truth now. Plan stays in /home/kmontemayor/.claude/plans/ for posterity. --- docs/plans/20260505-tb-multi-job-iteration.md | 362 ------------------ 1 file changed, 362 deletions(-) delete mode 100644 docs/plans/20260505-tb-multi-job-iteration.md diff --git a/docs/plans/20260505-tb-multi-job-iteration.md b/docs/plans/20260505-tb-multi-job-iteration.md deleted file mode 100644 index 0639252a0..000000000 --- a/docs/plans/20260505-tb-multi-job-iteration.md +++ /dev/null @@ -1,362 +0,0 @@ -# Multi-Job TensorBoard: Local Iteration & Final Design Plan - -Date: 2026-05-05 Branch: `kmonte/add-tb-for-glt` - -This plan supersedes the earlier branch plan at `docs/plans/20260504-tb-experiment-name-proto.md`. 
It incorporates -findings from two Codex plan reviews — round 1 at -`.claude/tmp/codex-verify/20260505-155740-plan-crystalline-giggling-backus/review.md` and round 2 at -`.claude/tmp/codex-verify/20260505-161326-plan-crystalline-giggling-backus/review.md`. Round-2 deltas (e.g. uniqueness -via timestamp suffix, returning the `CustomJob` from `launch_single_pool_job`, `--container-uri` required, no commit of -experiment name into the e2e CORA config) are applied during implementation, not via plan edits. - -## Context - -Across three full-pipeline iterations on this branch we've cycled through three TB integration designs, each broken in a -different way: - -1. **`submit(tensorboard=…)`** — auto-uploader runs, but the destination `TensorboardExperiment` is named after the - (numeric) `CustomJob` ID. Per-job page works (R1 ✓), but multiple jobs cannot share one TB page (R2 ✗). -2. **`submit(experiment=…)`** — never streams events. The SDK's `experiment=` is for Vertex AI Experiments - parameter/metric tracking; Vertex's TB auto-uploader is gated on `jobSpec.tensorboard` being set, which `experiment=` - is mutually exclusive with. Result: events written to `AIP_TENSORBOARD_LOG_DIR` sit in GCS un-uploaded. Job - 6570151780682825728 confirmed this empirically. -3. **Custom uploader from chief rank, no `tensorboard=`** — events stream to the chosen experiment (R2 ✓), but the VAI - job page no longer shows the "Open TensorBoard" link because that link is keyed on `jobSpec.tensorboard` (R1 ✗). Job - 4543918976459079680 confirmed this. - -R1 (TB link from job page) and R2 (multi-job comparison) are not mutually exclusive — they just can't be satisfied by a -single mechanism. The right approach combines both: server-side auto-uploader for the job-page link, plus a chief-rank -uploader for the cross-job comparison experiment, pointing at two different `TensorboardExperiment`s under the same -`Tensorboard` instance. 
Implementation is small; the risk is verifying behavior end-to-end. The fix for that is a tight -local iteration loop. - -## Success criteria - -| ID | Criterion | How verified | -| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | -| R1 | The Vertex AI job UI shows "Open TensorBoard" for a successful job, and clicking it loads the per-job experiment with this job's scalar runs. | Manual: open the job in the cloud console; click the link. | -| R2 | Two jobs submitted with the same `tensorboardExperimentName` show **two distinct runs** on one TB page (the user-named experiment), each carrying its own scalars. | Manual: open the named experiment URL; toggle both runs in the scalars dashboard. Smoke script also asserts run count + ≥1 `TensorboardTimeSeries` per run. | -| R3 | Jobs without `tensorboardExperimentName` keep working: events flow to a per-job auto-named experiment. No regression. | Existing `tests/unit/src/common/vertex_ai_test.py::test_submit_job_passes_tensorboard_and_base_output_dir` plus a smoke run with the field unset. | -| R4 | `make unit_test_py` and `make type_check` pass on the branch. | CI / local. | -| R5 (process) | A new dev script lets us submit a tiny CustomJob from a laptop and verify R1+R2 in \<2 minutes, end-to-end. | Run it twice; time both invocations. | -| R6 | Trainer process exits cleanly even when training fails — the chief-rank uploader does not hang the worker. | Inspected via the `try/finally` (or `with`) wrapping in all four training entrypoints; `make unit_test_py` covers the writer's idempotent close. 
| - -## Final design - -**(A) Set `jobSpec.tensorboard=` on every job that has a TB resource configured (even when an experiment name -is also set).** This restores the VAI job-page TB link unconditionally and continues to populate -`AIP_TENSORBOARD_RESOURCE_NAME` and `AIP_TENSORBOARD_LOG_DIR` in the worker. Vertex's auto-uploader streams events to a -per-job experiment named after the job's numeric ID — that's R1. - -**(B) When `tensorboard_experiment_name` is set, the launcher injects three env vars:** - -- `GIGL_TENSORBOARD_RESOURCE_NAME` — full Tensorboard resource name (already injected at HEAD). -- `GIGL_TENSORBOARD_EXPERIMENT_NAME` — the user-chosen experiment name (already injected at HEAD). -- `GIGL_TENSORBOARD_RUN_NAME` — **new**: derived from the launcher's `job_name`, with `_` → `-` (so the GCS subdir name - matches what the SDK's `reformat_run_name` will produce). Codex Issue 1 fix. - -**(C) `TensorBoardWriter.from_env()` (chief rank only):** - -- If `GIGL_TENSORBOARD_RUN_NAME` is set: write events to `//` (a *subdirectory*), not - to the parent. This makes the run name visible to both the server-side auto-uploader and our chief-rank uploader as a - `relpath` of the parent logdir, instead of the SDK's hardcoded `DEFAULT_RUN_NAME = "default"` - (`.venv/lib/python3.11/site-packages/google/cloud/aiplatform/tensorboard/uploader_utils.py:44`). Two jobs with - different run names → two distinct runs in the named experiment. Codex Issue 1 fix. -- If `GIGL_TENSORBOARD_RUN_NAME` is unset: write to `AIP_TENSORBOARD_LOG_DIR` directly (today's behavior, R3 path). -- If both `GIGL_TENSORBOARD_RESOURCE_NAME` and `GIGL_TENSORBOARD_EXPERIMENT_NAME` are also set, additionally - `aiplatform.start_upload_tb_log(tensorboard_id=…, tensorboard_experiment_name=…, logdir=AIP_TENSORBOARD_LOG_DIR)` — - the parent logdir, not the subdir, so the uploader's `LogdirLoader` discovers the subdir as a run via - `os.path.relpath`. 
**Do not pass `run_name_prefix`** — the subdir already gives us the run identity, and a non-empty - prefix would concatenate awkwardly with the discovered run name. -- `close()` already pairs with `aiplatform.end_upload_tb_log()` (`gigl/utils/tensorboard_writer.py:149`). - -**(D) Always use `with TensorBoardWriter.from_env(...)` in trainer entrypoints.** The SDK uploader thread is **not** a -daemon (`.venv/lib/python3.11/site-packages/google/cloud/aiplatform/tensorboard/uploader_tracker.py:162` — -`threading.Thread(...).start()` without `daemon=True`); the SDK's docstring explicitly says to call -`end_upload_tb_log()` in `finally` (`uploader_tracker.py:109`). Today's example trainers call `close()` only on the -happy path. Codex Issue 3 fix: switch all four trainers to context-manager use. - -The `submit(experiment=…)` SDK path and the `_ensure_experiment_with_backing_tb` helper are not needed for either -requirement; both are gone as of HEAD `e19f1050`. - -## Files to modify - -- `gigl/common/services/vertex_ai.py` — `_submit_job`: drop the experiment-name early branch; always set - `tensorboard=` whenever `job_config.tensorboard_resource_name` is non-empty. Keep the experiment-name regex - validation (fail-fast). Update the `VertexAiJobConfig` docstring around `gigl/common/services/vertex_ai.py:150` (Codex - Issue 6). -- `gigl/src/common/vertex_ai_launcher.py` — `_build_job_config`: keep the existing `GIGL_TENSORBOARD_RESOURCE_NAME` / - `GIGL_TENSORBOARD_EXPERIMENT_NAME` injection; **add** `GIGL_TENSORBOARD_RUN_NAME` (sanitized job name). Update the - comment block at `gigl/src/common/vertex_ai_launcher.py:300` describing what `_submit_job` does (Codex Issue 6). -- `gigl/utils/tensorboard_writer.py` — `from_env()` reads `GIGL_TENSORBOARD_RUN_NAME` and uses it as a subdir of - `AIP_TENSORBOARD_LOG_DIR` for the `tf.summary.create_file_writer` log_dir; `_maybe_start_uploader` still watches the - parent logdir. 
-- `proto/snapchat/research/gbml/gbml_config.proto:204` — update the `tensorboard_experiment_name` comment to describe - the dual-uploader behavior, not the dropped `experiment=`-backed design (Codex Issue 6). Run `make compile_protos` to - regenerate Python + Scala stubs. -- `examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml:26` — change `tensorboardExperimentName` from the - personal `kmonte-test-experiment` to `homogeneous-link-prediction-comparison` (Codex Issue 5). -- `examples/link_prediction/homogeneous_training.py`, `examples/link_prediction/heterogeneous_training.py`, - `examples/link_prediction/graph_store/homogeneous_training.py`, - `examples/link_prediction/graph_store/heterogeneous_training.py` — replace the existing - `tensorboard_writer = TensorBoardWriter.from_env(...)` + later `.close()` pattern with a `with` block. (Codex Issue 3 - \+ Impact Analysis.) -- `tests/unit/src/common/vertex_ai_test.py` — rename - `test_submit_job_skips_experiment_and_tensorboard_when_experiment_name_set` to - `test_submit_job_passes_tensorboard_with_or_without_experiment_name` and assert `tensorboard=` is set in both - branches. -- `tests/unit/src/common/vertex_ai_launcher_test.py` — assert `GIGL_TENSORBOARD_RUN_NAME` is injected when an experiment - name is set; not injected otherwise. -- `tests/unit/utils/tensorboard_writer_test.py` — assert the writer's effective `log_dir` is the subdir - (`//`) when `GIGL_TENSORBOARD_RUN_NAME` is set; assert `start_upload_tb_log` is called with - `logdir=` (NOT the subdir) and no `run_name_prefix`. -- `python -m gigl.utils.dev.submit_smoke_job` — **new** local iteration tool. The `tools/` directory already exists in - the repo (Codex correction). - -## Local iteration tool - -A standalone Python script that bypasses ConfigPopulator and the full pipeline. Goal: \<2 min from "I changed code" to -"I see whether TB shows up." - -Path: `python -m gigl.utils.dev.submit_smoke_job`. - -What it does: - -1. 
**Use the production launcher path** (`gigl.src.common.vertex_ai_launcher.launch_single_pool_job`) — *not* - `VertexAIService.launch_job` directly — so the same `_build_job_config` env-var injection runs as in production. - Codex Issue 2 fix. -2. Constructs a small `VertexAiResourceConfig` proto inline: - - `machine_type="n1-standard-2"`, `gpu_type="ACCELERATOR_TYPE_UNSPECIFIED"`, `gpu_limit=0`, `num_replicas=1`, - `tensorboard_resource_name=`. -3. Constructs a small `GiglResourceConfig` proto with that trainer config + - `shared_resource_config.common_compute_config` populated from CLI flags. -4. Calls `launch_single_pool_job(...)` with: - - `process_command="python -m gigl.utils.dev.tb_smoke_main"` — a tiny module added in the same commit; reads env - vars, instantiates `TensorBoardWriter.from_env(enabled=True)`, writes 3 scalar events at steps 0/1/2, sleeps ~30s, - exits. - - `tensorboard_logs_uri = GcsUri("gs:///tb-smoke//logs/")` — drives `base_output_dir` via the - existing helper at `gigl/src/common/vertex_ai_launcher.py:_get_base_output_dir_from_tensorboard_logs_uri`. - - `tensorboard_experiment_name` from a CLI flag (or `None`). -5. After completion, queries the Vertex AI APIs: - - `aiplatform.TensorboardExperiment.list(tensorboard_name=)` (`tensorboard_resource.py:518`) to confirm - both expected experiments exist (the per-job auto-named one always; the user-named one only when the - experiment-name flag was passed). - - For each expected run, `aiplatform.TensorboardTimeSeries.list(tensorboard_run_name=)` - (`tensorboard_resource.py:1264`) to confirm at least one scalar tag exists. Codex Issue 4 fix — - `TensorboardRun.list` alone only confirms run *existence*, not that scalars were ingested. -6. Prints both TB UI URLs (per-job and named) for manual inspection. 
- -Required CLI flags: `--project`, `--region`, `--service-account`, `--staging-bucket`, `--tensorboard` (full resource -name), and optional `--experiment-name`, `--container-uri` (defaults to `DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU` from -`gigl/common/constants.py:69`), `--dry-run`. - -Existing infrastructure to reuse: - -- `gigl/src/common/vertex_ai_launcher.py:launch_single_pool_job` — production entry; running through this exercises - env-var injection. -- `gigl/common/services/vertex_ai.py:VertexAiJobConfig` — config dataclass. -- `gigl/utils/tensorboard_writer.py:TensorBoardWriter` — same writer the trainers use. -- `aiplatform.TensorboardExperiment.list` / `aiplatform.TensorboardRun.list` / `aiplatform.TensorboardTimeSeries.list` — - verification surfaces. -- `DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU` from `gigl/common/constants.py:69` — default container image. - -## Step-by-step plan - -Each step ends with a verification. - -### Step 1: revert `_submit_job` to always pass `tensorboard=` and refresh stale comments - -Production code: - -- `gigl/common/services/vertex_ai.py:_submit_job` — set - `submit_kwargs["tensorboard"] = job_config.tensorboard_resource_name` whenever `job_config.tensorboard_resource_name` - is non-empty, regardless of `tensorboard_experiment_name`. Keep the experiment-name regex validation gate. -- `gigl/common/services/vertex_ai.py:150` — update the `VertexAiJobConfig.tensorboard_experiment_name` docstring to - describe "auxiliary chief-rank uploader streams events to this experiment in addition to the per-job auto-named one." -- `gigl/src/common/vertex_ai_launcher.py:300` — update the comment block describing `_submit_job` behavior. -- `proto/snapchat/research/gbml/gbml_config.proto:204` — replace the `experiment=`-backed description with the new - dual-uploader description; run `make compile_protos`. 
- -Tests: - -- `tests/unit/src/common/vertex_ai_test.py` — rename - `test_submit_job_skips_experiment_and_tensorboard_when_experiment_name_set` → - `test_submit_job_passes_tensorboard_with_or_without_experiment_name`; assert `tensorboard=` is set in both branches. - -Verify: `make unit_test_py PY_TEST_FILES="vertex_ai_test.py"` passes; `make type_check` is clean. - -Commit: `vertex_ai: always pass tensorboard= so VAI job page links to TB`. - -### Step 2: inject `GIGL_TENSORBOARD_RUN_NAME` and consume it in the writer - -Production code: - -- `gigl/src/common/vertex_ai_launcher.py:_build_job_config` — when `tensorboard_experiment_name` is set, also append - `env_var.EnvVar(name="GIGL_TENSORBOARD_RUN_NAME", value=job_name.replace("_", "-"))` next to the existing two - GIGL_TENSORBOARD\_\* env vars. (We pre-sanitize so the GCS subdir name and the SDK-derived run name agree.) -- `gigl/utils/tensorboard_writer.py:from_env` — if `GIGL_TENSORBOARD_RUN_NAME` is set, compute - `effective_log_dir = os.path.join(AIP_TENSORBOARD_LOG_DIR, run_name)` and pass that to - `tf.summary.create_file_writer`. Otherwise pass `AIP_TENSORBOARD_LOG_DIR` (today's behavior). -- `gigl/utils/tensorboard_writer.py:_maybe_start_uploader` — keep watching the **parent** `AIP_TENSORBOARD_LOG_DIR` (so - the SDK's `LogdirLoader` discovers the run via `os.path.relpath(subdir, logdir)` as the subdir name). No - `run_name_prefix`. - -Tests: - -- `tests/unit/src/common/vertex_ai_launcher_test.py` — assert the GIGL_TENSORBOARD_RUN_NAME env var is injected when an - experiment name is set; underscores in the job name become hyphens; not injected when experiment name is unset. -- `tests/unit/utils/tensorboard_writer_test.py` — when `GIGL_TENSORBOARD_RUN_NAME=my-run`: assert the writer's - underlying file-writer was created for `/my-run/`; assert `start_upload_tb_log` called with `logdir=` - and no `run_name_prefix`. When unset: writer uses parent dir directly (regression coverage for R3). 
- -Verify: `make unit_test_py PY_TEST_FILES="vertex_ai_launcher_test.py"`; -`make unit_test_py PY_TEST_FILES="tensorboard_writer_test.py"`. - -Commit: `tensorboard: emit unique run names so multi-job comparison shows two runs`. - -### Step 3: harden trainer uploader lifecycle - -For each of: - -- `examples/link_prediction/homogeneous_training.py` (`tensorboard_writer = TensorBoardWriter.from_env(...)` at line - 364, `.close()` at line 621) -- `examples/link_prediction/heterogeneous_training.py` -- `examples/link_prediction/graph_store/homogeneous_training.py` -- `examples/link_prediction/graph_store/heterogeneous_training.py` - -Replace the assignment + later `.close()` pattern with -`with TensorBoardWriter.from_env(enabled=is_chief_process) as tensorboard_writer:` wrapping the body. The writer already -supports `__enter__`/`__exit__`; this just guarantees `end_upload_tb_log` runs even when training raises. - -If the writer is used at module scope across many functions (and a single `with` block would force a large diff), wrap -the function that owns the training loop in `try/finally` and call `tensorboard_writer.close()` in `finally`. - -Tests: existing `make unit_test_py PY_TEST_FILES="tensorboard_writer_test.py"` already covers idempotent close. No new -unit tests required (these example scripts are not unit-tested today). - -Verify: `make type_check`; manually re-read each modified entrypoint to confirm the writer's lifetime spans the entire -training-loop scope. - -Commit: `examples: scope TensorBoardWriter to a context manager in all training entrypoints`. - -### Step 4: write `python -m gigl.utils.dev.submit_smoke_job` + `gigl/utils/dev/tb_smoke_main.py` - -- `gigl/utils/dev/tb_smoke_main.py`: new module. ~25 lines. Uses `TensorBoardWriter.from_env(enabled=True)` to write 3 - scalar events (`{"smoke/value": float(step)}` at steps 0, 1, 2) inside a `with` block, then `time.sleep(30)` to let - both uploaders flush. 
Module-level entry so it can be invoked with `python -m gigl.utils.dev.tb_smoke_main`. -- `python -m gigl.utils.dev.submit_smoke_job`: new top-level script. - - argparse for `--project`, `--region`, `--service-account`, `--staging-bucket`, `--tensorboard`, optional - `--experiment-name`, `--container-uri`, `--dry-run`. - - Builds `VertexAiResourceConfig` and `GiglResourceConfig` protos inline (mirror the patterns in - `tests/unit/src/common/vertex_ai_launcher_test.py:_create_gigl_resource_config_with_single_pool_inference` for - shape). - - Calls - `launch_single_pool_job(... vertex_ai_region=, tensorboard_logs_uri=GcsUri("gs:///tb-smoke//logs/"), tensorboard_experiment_name=)`. - - On `--dry-run`: print the resulting `VertexAiJobConfig` and exit 0. - - On real run: wait via `service.launch_job` (synchronous), then poll the verification APIs: - - `aiplatform.TensorboardExperiment.list(tensorboard_name=)` — assert per-job experiment with the job's - numeric ID exists; assert user-experiment exists iff flag passed. - - For each expected experiment: `aiplatform.TensorboardRun.list(tensorboard_experiment_name=)` — - assert at least one run, and (for `--experiment-name` mode) that the run name matches the sanitized job name. - - For each expected run: `aiplatform.TensorboardTimeSeries.list(tensorboard_run_name=)` — assert - at least one time series with at least one tag (Codex Issue 4 fix). - - Print both UI URLs. - -Verify (offline): -`python python -m gigl.utils.dev.submit_smoke_job --dry-run --project=… --region=… --service-account=… --staging-bucket=gs://… --tensorboard=projects/…/tensorboards/… --experiment-name=tb-smoke-multi` -prints the `VertexAiJobConfig` and exits 0 without touching GCP. - -Commit: `tools: add dev_submit_tb_smoke_job + tb_smoke_main for fast TB iteration`. - -### Step 5: smoke-validate R1 + R3 (no experiment name) - -Run the smoke script without `--experiment-name`. 
After completion (≤2 min): - -- The Vertex AI job UI for the run shows "Open TensorBoard"; clicking it loads the per-job experiment (R1). -- The per-job experiment exists with one run named `default` (R3 — no `GIGL_TENSORBOARD_RUN_NAME` injected, the writer - falls back to writing to the parent logdir). -- No experiment with the user-named slug exists. - -If R3 fails, suspect Step 1's submit-kwargs change. The smoke loop iteration is the diagnostic surface. - -### Step 6: smoke-validate R1 + R2 (with experiment name) - -Run twice with the same flag: `--experiment-name=tb-smoke-multi`. After both complete: - -- Both job pages still show working "Open TensorBoard" links (R1). -- Two per-job experiments exist (one per job, auto-named). -- The `tb-smoke-multi` experiment exists with **two runs**, named after each sanitized job name. -- Each of those runs has at least one `TensorboardTimeSeries` for the `smoke/value` tag. - -If R2 fails (e.g., one merged run instead of two), suspect Step 2's run-name plumbing — iterate within the smoke loop, -not the full pipeline. - -### Step 7: full-pipeline regression test - -With R1 + R2 verified at the smoke layer, kick off one real homogeneous training run with -`tensorboardExperimentName: "homogeneous-link-prediction-comparison"` (the value updated in Step 1's config edit, Codex -Issue 5). Verify: - -- "Open TensorBoard" link works on the job page (R1). -- The named experiment shows the run with all trainer scalar tags (R2). - -### Step 8: shipping checklist - -- `make unit_test_py` and `make type_check` clean. -- The original branch plan's Task 11 manual smoke test gate is now satisfied by Steps 5–7. -- `make format`. -- Optionally request final code review on the post-step-1 diff via `superpowers:code-reviewer`. -- Open the PR. 
- -### Step 0 (close-out, runs after exit-plan-mode): relocate this plan to `docs/plans/` - -`mv /home/kmontemayor/.claude/plans/crystalline-giggling-backus.md docs/plans/20260505-tb-multi-job-iteration.md` — and -add a note in the new file's header pointing at the supersedence relationship with -`docs/plans/20260504-tb-experiment-name-proto.md`. Per CLAUDE.md plan conventions (`CLAUDE.md:252`, Codex Issue 7). - -## Verification summary - -| Step | Type | Cost | What it proves | -| ---- | ------------------------------------- | --------- | -------------------------------------------------------------------------- | -| 1, 2 | Unit tests + `type_check` | seconds | Code paths aren't broken; env-var injection + writer subdir wiring correct | -| 3 | Read-through + `type_check` | seconds | Lifecycle hardening compiles | -| 4 | `--dry-run` of smoke script | seconds | Script wires correctly without submitting | -| 5 | One smoke run (no experiment-name) | ~1–2 min | R1 + R3 | -| 6 | Two smoke runs (same experiment-name) | ~3–4 min | R1 + R2 (run identity, scalar ingestion) | -| 7 | One real homogeneous training run | ~5–15 min | Full pipeline + R1 + R2 | - -Total budget for design-and-verify: ~30 minutes of cluster time. - -## Risks & open questions - -- **The chief-rank uploader thread is not a daemon** (`uploader_tracker.py:162`). Process exit will not reap it; - `end_upload_tb_log()` MUST be called. Step 3 enforces this via `with` blocks in all four trainer entrypoints. Codex - Issue 3 fix — the original plan's claim that "the SDK's uploader thread is daemon" was wrong. -- **Race between two uploaders on the same logdir.** Both uploaders read events from GCS; neither writes. Each maintains - its own `LogdirLoader` state. No conflict observed in the SDK source. Step 5 + 6 confirm in practice. -- **Quota.** Two uploaders ≈ 2× ingestion request rate per opt-in job. Acceptable; revisit only on 429s. 
-- **GCS subdir vs logdir parent.** The chief-rank uploader watches `AIP_TENSORBOARD_LOG_DIR` (parent) and discovers the - run as the subdir name. The server-side auto-uploader does the same. If we ever switch to writing events directly at - the parent (no subdir), R2 collapses back to a single `default` run. Step 2's tests pin both ends. -- **`make compile_protos` regenerates Scala stubs as well.** The proto comment update in Step 1 will create a noisy diff - in `scala/...` and `scala_spark35/...`. Acceptable. - -## Roll-back - -If Steps 5 or 6 fail and the chief-rank uploader is the cause, set just `tensorboard=` on submit and stop -injecting any `GIGL_TENSORBOARD_*` env vars. Falls back to R1-only (per-job TB), losing R2 — back to the state before -this branch, with no regression. - -## Codex review traceability - -Issues 1–7 from `.claude/tmp/codex-verify/20260505-155740-plan-crystalline-giggling-backus/review.md`: - -| Issue | Severity | Addressed in | -| --------------------------------------- | -------- | ----------------------------------------------------------------------------------------------- | -| 1 — Run identity collapse | High | Step 2 (subdir-based run names, no `run_name_prefix`) | -| 2 — Smoke script bypasses env injection | High | Step 4 (smoke script uses `launch_single_pool_job`) | -| 3 — Uploader thread not daemon | High | Step 3 (`with` wrapping in all four trainers) | -| 4 — TimeSeries verification | Medium | Step 4 (smoke script asserts `TensorboardTimeSeries.list`) | -| 5 — Wrong experiment-name in Step 5 | Medium | Step 1 (config update from `kmonte-test-experiment` → `homogeneous-link-prediction-comparison`) | -| 6 — Stale comments / proto doc | Low | Step 1 (vertex_ai.py:150, vertex_ai_launcher.py:300, gbml_config.proto:204) | -| 7 — Plan-file location convention | Low | Step 0 (move to `docs/plans/20260505-tb-multi-job-iteration.md`) | From 36798ed67e55d36277cb7464baab46f1915456b5 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 
5 May 2026 22:58:21 +0000 Subject: [PATCH 45/59] vertex_ai: collapse submit_kwargs dict back to direct kwargs Now that we always pass tensorboard= when set (and just None otherwise), the conditional dict is unnecessary. Reverts the dict + conditional add added in 4868e04e to a single direct submit() call. Drops the now-unused Any import. --- gigl/common/services/vertex_ai.py | 33 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/gigl/common/services/vertex_ai.py b/gigl/common/services/vertex_ai.py index 5f9c6ba56..cd119b321 100644 --- a/gigl/common/services/vertex_ai.py +++ b/gigl/common/services/vertex_ai.py @@ -63,7 +63,7 @@ def get_pipeline() -> int: # NOTE: `get_pipeline` here is the Pipeline name import re import time from dataclasses import dataclass -from typing import Any, Final, Optional, Union +from typing import Final, Optional, Union from google.cloud import aiplatform from google.cloud.aiplatform_v1.types import ( @@ -406,12 +406,6 @@ def _submit_job( staging_bucket=self._staging_bucket, base_output_dir=job_config.base_output_dir, ) - submit_kwargs: dict[str, Any] = dict( - service_account=self._service_account, - timeout=job_config.timeout_s, - enable_web_access=job_config.enable_web_access, - scheduling_strategy=job_config.scheduling_strategy, - ) if job_config.tensorboard_experiment_name: if not job_config.tensorboard_resource_name: raise ValueError( @@ -426,16 +420,21 @@ def _submit_job( f"is not a valid Vertex AI Experiment ID; it must match " f"{_VERTEX_RESOURCE_ID_PATTERN.pattern}." ) - if job_config.tensorboard_resource_name: - # Always pass ``tensorboard=`` whenever a TB resource is - # configured, so the Vertex AI job page shows an "Open TensorBoard" - # link to the auto-named per-job experiment. 
When - # ``tensorboard_experiment_name`` is also set, the launcher has - # injected ``GIGL_TENSORBOARD_*`` env vars and the trainer's chief - # rank additionally streams events to the user-named experiment - # via ``aiplatform.start_upload_tb_log``. - submit_kwargs["tensorboard"] = job_config.tensorboard_resource_name - job.submit(**submit_kwargs) + + # Always pass ``tensorboard=`` when a TB resource is + # configured so the Vertex AI job page shows an "Open TensorBoard" + # link to the auto-named per-job experiment. When + # ``tensorboard_experiment_name`` is also set, the launcher injects + # ``GIGL_TENSORBOARD_*`` env vars and the trainer's chief rank + # additionally streams events to the user-named experiment via + # ``aiplatform.start_upload_tb_log``. + job.submit( + service_account=self._service_account, + timeout=job_config.timeout_s, + enable_web_access=job_config.enable_web_access, + scheduling_strategy=job_config.scheduling_strategy, + tensorboard=job_config.tensorboard_resource_name or None, + ) job.wait_for_resource_creation() logger.info(f"Created job: {job.resource_name}") # Copying https://github.com/googleapis/python-aiplatform/blob/v1.48.0/google/cloud/aiplatform/jobs.py#L207-L215 From 24e66be958410b725344cf461dc05e2e36127b70 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 22:58:48 +0000 Subject: [PATCH 46/59] =?UTF-8?q?v1:=20revert=20all=20v1=20trainer=20chang?= =?UTF-8?q?es=20=E2=80=94=20out=20of=20scope=20for=20this=20PR?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The branch's v1/trainer.py refactor (inline launch → launch_single_pool_job) and v1/lib/training_process.py tensorboard-writer removal land separately. Drops gigl/src/training/v1/trainer.py + .../training_process.py back to main; deletes the v1 trainer unit-test file. After this commit: `git diff main...HEAD -- gigl/src/training/v1/` is empty. 
--- gigl/src/training/v1/lib/training_process.py | 27 +++--- gigl/src/training/v1/trainer.py | 67 +++++++------- tests/unit/src/training/v1_trainer_test.py | 95 -------------------- 3 files changed, 53 insertions(+), 136 deletions(-) delete mode 100644 tests/unit/src/training/v1_trainer_test.py diff --git a/gigl/src/training/v1/lib/training_process.py b/gigl/src/training/v1/lib/training_process.py index 31894a7c3..9d8e8f21b 100644 --- a/gigl/src/training/v1/lib/training_process.py +++ b/gigl/src/training/v1/lib/training_process.py @@ -8,6 +8,7 @@ from distutils.util import strtobool from typing import Any, Optional +import tensorflow as tf import torch import torch.distributed import torch.nn.parallel @@ -215,20 +216,26 @@ def __run_training( ): trainer_instance.setup_for_training() logger.info(f"Starting training at {current_formatted_datetime()}") + tensorboard_log_uri = gbml_config_pb_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri profiler = get_torch_profiler_instance( gbml_config_pb_wrapper=gbml_config_pb_wrapper ) - with ( - profiler.profiler_context() # type: ignore[attr-defined] - if profiler - else contextlib.nullcontext() as prof - ): - trainer_instance.train( - gbml_config_pb_wrapper=gbml_config_pb_wrapper, - device=device, - profiler=prof, - ) + file_writer = None + if gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard: + file_writer = tf.summary.create_file_writer(tensorboard_log_uri) + + with file_writer.as_default() if file_writer else contextlib.nullcontext(): + with ( + profiler.profiler_context() # type: ignore[attr-defined] + if profiler + else contextlib.nullcontext() as prof + ): + trainer_instance.train( + gbml_config_pb_wrapper=gbml_config_pb_wrapper, + device=device, + profiler=prof, + ) if profiler: if does_path_exist(TMP_PROFILER_LOG_DIR_NAME): file_loader = FileLoader() diff --git a/gigl/src/training/v1/trainer.py b/gigl/src/training/v1/trainer.py index 9d3655e98..c1509ea54 100644 --- 
a/gigl/src/training/v1/trainer.py +++ b/gigl/src/training/v1/trainer.py @@ -2,7 +2,7 @@ from typing import Optional import torch -from google.cloud.aiplatform_v1.types import accelerator_type +from google.cloud.aiplatform_v1.types import accelerator_type, env_var from gigl.common import Uri, UriFactory from gigl.common.constants import ( @@ -10,12 +10,11 @@ DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA, ) from gigl.common.logger import Logger +from gigl.common.services.vertex_ai import VertexAiJobConfig, VertexAIService from gigl.env.pipelines_config import get_resource_config from gigl.src.common.constants.components import GiGLComponents from gigl.src.common.types import AppliedTaskIdentifier -from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper from gigl.src.common.utils.metrics_service_provider import initialize_metrics -from gigl.src.common.vertex_ai_launcher import launch_single_pool_job from gigl.src.training.v1.lib.training_process import GnnTrainingProcess from snapchat.research.gbml.gigl_resource_config_pb2 import ( LocalResourceConfig, @@ -44,37 +43,43 @@ def run( is_cpu_training = self._determine_if_cpu_training(trainer_config) if isinstance(trainer_config, VertexAiResourceConfig): - gbml_config_pb_wrapper = ( - GbmlConfigPbWrapper.get_gbml_config_pb_wrapper_from_uri( - gbml_config_uri=task_config_uri - ) - ) - raw_tensorboard_logs_uri = gbml_config_pb_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri - tensorboard_logs_uri = ( - UriFactory.create_uri(raw_tensorboard_logs_uri) - if raw_tensorboard_logs_uri - else None - ) - tensorboard_experiment_name = ( - gbml_config_pb_wrapper.trainer_config.tensorboard_experiment_name - or None + cpu_docker_uri = cpu_docker_uri or DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU + cuda_docker_uri = cuda_docker_uri or DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA + container_uri = cpu_docker_uri if is_cpu_training else cuda_docker_uri + environment_variables: list[env_var.EnvVar] = [ + 
env_var.EnvVar(name="TF_CPP_MIN_LOG_LEVEL", value="3"), + ] + job_args = [ + f"--job_name={applied_task_identifier}", + f"--task_config_uri={task_config_uri}", + f"--resource_config_uri={resource_config_uri}", + ] + ([] if is_cpu_training else ["--use_cuda"]) + + job_config = VertexAiJobConfig( + job_name=applied_task_identifier, + container_uri=container_uri, + command=["python", "-m", "gigl.src.training.v1.lib.training_process"], + args=job_args, + environment_variables=environment_variables, + machine_type=trainer_config.machine_type, + accelerator_type=trainer_config.gpu_type.upper().replace("-", "_"), + accelerator_count=trainer_config.gpu_limit, + replica_count=trainer_config.num_replicas, + labels=resource_config.get_resource_labels( + component=GiGLComponents.Trainer + ), + timeout_s=trainer_config.timeout if trainer_config.timeout else None, ) - launch_single_pool_job( - vertex_ai_resource_config=trainer_config, - job_name=str(applied_task_identifier), - task_config_uri=task_config_uri, - resource_config_uri=resource_config_uri, - process_command="python -m gigl.src.training.v1.lib.training_process", - process_runtime_args={}, - resource_config_wrapper=resource_config, - cpu_docker_uri=cpu_docker_uri or DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU, - cuda_docker_uri=cuda_docker_uri or DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA, - component=GiGLComponents.Trainer, - vertex_ai_region=resource_config.vertex_ai_trainer_region, - tensorboard_logs_uri=tensorboard_logs_uri, - tensorboard_experiment_name=tensorboard_experiment_name, + + vertex_ai_service = VertexAIService( + project=resource_config.project, + location=resource_config.region, + service_account=resource_config.service_account_email, + staging_bucket=resource_config.temp_assets_regional_bucket_path.uri, ) + vertex_ai_service.launch_job(job_config=job_config) + elif isinstance(trainer_config, LocalResourceConfig): training_process = GnnTrainingProcess() training_process.run( diff --git 
a/tests/unit/src/training/v1_trainer_test.py b/tests/unit/src/training/v1_trainer_test.py deleted file mode 100644 index 70d3adb95..000000000 --- a/tests/unit/src/training/v1_trainer_test.py +++ /dev/null @@ -1,95 +0,0 @@ -"""Unit tests for v1 Trainer — verifies tensorboard_experiment_name forwarding.""" - -from unittest.mock import MagicMock, patch - -from gigl.common import UriFactory -from gigl.src.common.types import AppliedTaskIdentifier -from gigl.src.training.v1.trainer import Trainer -from snapchat.research.gbml import gbml_config_pb2, gigl_resource_config_pb2 -from tests.test_assets.test_case import TestCase - - -def _make_resource_config_wrapper_with_single_pool() -> MagicMock: - """Return a GiglResourceConfigWrapper mock backed by a VertexAiResourceConfig.""" - vertex_ai_config = gigl_resource_config_pb2.VertexAiResourceConfig( - machine_type="n1-standard-8", - num_replicas=1, - timeout=7200, - ) - mock_wrapper = MagicMock() - mock_wrapper.trainer_config = vertex_ai_config - mock_wrapper.vertex_ai_trainer_region = "us-central1" - return mock_wrapper - - -def _make_gbml_config_pb_wrapper(experiment_name: str = "my-comparison") -> MagicMock: - """Return a GbmlConfigPbWrapper mock with tensorboard_experiment_name set.""" - trainer_config_proto = gbml_config_pb2.GbmlConfig.TrainerConfig( - tensorboard_experiment_name=experiment_name, - ) - - mock_wrapper = MagicMock() - mock_wrapper.trainer_config = trainer_config_proto - # Ensure tensorboard_logs_uri is empty so UriFactory is not called. 
- mock_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri = "" - return mock_wrapper - - -class TestV1TrainerExperimentNameForwarding(TestCase): - """Tests that v1 Trainer forwards tensorboard_experiment_name to the launcher.""" - - @patch("gigl.src.training.v1.trainer.launch_single_pool_job") - @patch("gigl.src.training.v1.trainer.GbmlConfigPbWrapper") - @patch("gigl.src.training.v1.trainer.get_resource_config") - def test_single_pool_forwards_experiment_name( - self, - mock_get_resource_config, - mock_gbml_config_cls, - mock_launch_single_pool_job, - ) -> None: - """launch_single_pool_job receives tensorboard_experiment_name='my-comparison'.""" - mock_get_resource_config.return_value = ( - _make_resource_config_wrapper_with_single_pool() - ) - mock_gbml_config_cls.get_gbml_config_pb_wrapper_from_uri.return_value = ( - _make_gbml_config_pb_wrapper("my-comparison") - ) - - trainer = Trainer() - trainer.run( - applied_task_identifier=AppliedTaskIdentifier("test-job"), - task_config_uri=UriFactory.create_uri("gs://bucket/task.yaml"), - resource_config_uri=UriFactory.create_uri("gs://bucket/resource.yaml"), - ) - - mock_launch_single_pool_job.assert_called_once() - call_kwargs = mock_launch_single_pool_job.call_args.kwargs - self.assertEqual(call_kwargs["tensorboard_experiment_name"], "my-comparison") - - @patch("gigl.src.training.v1.trainer.launch_single_pool_job") - @patch("gigl.src.training.v1.trainer.GbmlConfigPbWrapper") - @patch("gigl.src.training.v1.trainer.get_resource_config") - def test_single_pool_empty_experiment_name_becomes_none( - self, - mock_get_resource_config, - mock_gbml_config_cls, - mock_launch_single_pool_job, - ) -> None: - """Empty string tensorboard_experiment_name is coerced to None.""" - mock_get_resource_config.return_value = ( - _make_resource_config_wrapper_with_single_pool() - ) - mock_gbml_config_cls.get_gbml_config_pb_wrapper_from_uri.return_value = ( - _make_gbml_config_pb_wrapper("") # proto default empty string - ) - 
- trainer = Trainer() - trainer.run( - applied_task_identifier=AppliedTaskIdentifier("test-job"), - task_config_uri=UriFactory.create_uri("gs://bucket/task.yaml"), - resource_config_uri=UriFactory.create_uri("gs://bucket/resource.yaml"), - ) - - mock_launch_single_pool_job.assert_called_once() - call_kwargs = mock_launch_single_pool_job.call_args.kwargs - self.assertIsNone(call_kwargs["tensorboard_experiment_name"]) From dd557448cf9d63f009d8a0619168025fa4b33b1c Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 23:07:13 +0000 Subject: [PATCH 47/59] docs: add Vertex AI doc-link references to TB code paths Cross-references in source comments and proto comments so readers can jump to the canonical Google Cloud docs for CustomJobSpec, TensorBoard data model (Tensorboard / TensorboardExperiment / TensorboardRun / TensorboardTimeSeries), and the auto-uploader contract. --- gigl/common/services/vertex_ai.py | 8 ++++++-- gigl/src/common/vertex_ai_launcher.py | 4 ++++ gigl/utils/tensorboard_writer.py | 5 ++++- .../research/gbml/gigl_resource_config.proto | 2 ++ .../research/gbml/trained_model_metadata.proto | 5 ++++- .../VertexAiResourceConfig.scala | 2 ++ .../TrainedModelMetadata.scala | 15 +++++++++------ .../VertexAiResourceConfig.scala | 2 ++ .../TrainedModelMetadata.scala | 15 +++++++++------ .../research/gbml/gigl_resource_config_pb2.pyi | 2 ++ .../research/gbml/trained_model_metadata_pb2.pyi | 6 +++++- 11 files changed, 49 insertions(+), 17 deletions(-) diff --git a/gigl/common/services/vertex_ai.py b/gigl/common/services/vertex_ai.py index cd119b321..6d3d865ad 100644 --- a/gigl/common/services/vertex_ai.py +++ b/gigl/common/services/vertex_ai.py @@ -134,7 +134,9 @@ class VertexAiJobConfig: Each field maps to a property on the ``WorkerPoolSpec`` / ``MachineSpec`` / ``DiskSpec`` / ``ContainerSpec`` protos that Vertex AI - uses to describe a CustomJob. + uses to describe a CustomJob. 
See + https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec + for the canonical reference. Example: >>> from google.cloud.aiplatform_v1.types import ReservationAffinity @@ -427,7 +429,9 @@ def _submit_job( # ``tensorboard_experiment_name`` is also set, the launcher injects # ``GIGL_TENSORBOARD_*`` env vars and the trainer's chief rank # additionally streams events to the user-named experiment via - # ``aiplatform.start_upload_tb_log``. + # ``aiplatform.start_upload_tb_log``. See + # https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training + # for Vertex's auto-uploader contract. job.submit( service_account=self._service_account, timeout=job_config.timeout_s, diff --git a/gigl/src/common/vertex_ai_launcher.py b/gigl/src/common/vertex_ai_launcher.py index f9cd7db67..32bdd7953 100644 --- a/gigl/src/common/vertex_ai_launcher.py +++ b/gigl/src/common/vertex_ai_launcher.py @@ -350,6 +350,10 @@ def _build_job_config( # ``TensorboardRun`` in the named experiment, so two jobs sharing # ``tensorboard_experiment_name`` show up as two runs (instead of merging # into one ``default`` run). + # + # References: + # https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview + # https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec container_env_vars = list(env_vars) if ( tensorboard_experiment_name diff --git a/gigl/utils/tensorboard_writer.py b/gigl/utils/tensorboard_writer.py index aecefa121..6b39d76c9 100644 --- a/gigl/utils/tensorboard_writer.py +++ b/gigl/utils/tensorboard_writer.py @@ -189,7 +189,10 @@ def _maybe_start_uploader(*, parent_log_dir: str) -> bool: Watches ``parent_log_dir`` (not the run-name subdir under it), so the SDK's ``LogdirLoader`` discovers each run via - ``os.path.relpath(subdir, parent_log_dir)``. + ``os.path.relpath(subdir, parent_log_dir)``. 
The Vertex AI TensorBoard + data model (``Tensorboard`` → ``TensorboardExperiment`` → ``TensorboardRun`` + → ``TensorboardTimeSeries``) is documented at + https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview. Returns ``True`` if the uploader was started (caller must arrange for ``aiplatform.end_upload_tb_log`` on shutdown), ``False`` otherwise. diff --git a/proto/snapchat/research/gbml/gigl_resource_config.proto b/proto/snapchat/research/gbml/gigl_resource_config.proto index f7e30bc8f..29f0c9762 100644 --- a/proto/snapchat/research/gbml/gigl_resource_config.proto +++ b/proto/snapchat/research/gbml/gigl_resource_config.proto @@ -133,6 +133,8 @@ message VertexAiResourceConfig { // Existing Vertex AI TensorBoard resource to attach to the job. // Format: projects/{project}/locations/{region}/tensorboards/{tensorboard_id} + // See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview + // for the Tensorboard data model. string tensorboard_resource_name = 10; } diff --git a/proto/snapchat/research/gbml/trained_model_metadata.proto b/proto/snapchat/research/gbml/trained_model_metadata.proto index 341133b5a..7c02de4ac 100644 --- a/proto/snapchat/research/gbml/trained_model_metadata.proto +++ b/proto/snapchat/research/gbml/trained_model_metadata.proto @@ -9,6 +9,9 @@ message TrainedModelMetadata{ string scripted_model_uri = 2; // The path where evaluation metrics are stored string eval_metrics_uri = 3; - // Path where tensorboard logs will be stored + // Path where tensorboard logs will be stored. Vertex AI maps this URI to + // ``AIP_TENSORBOARD_LOG_DIR`` inside trainer containers via + // ``CustomJobSpec.baseOutputDirectory``. See + // https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec. 
string tensorboard_logs_uri = 4; } diff --git a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala index d2394a65e..8a29093bb 100644 --- a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala +++ b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala @@ -39,6 +39,8 @@ package snapchat.research.gbml.gigl_resource_config * @param tensorboardResourceName * Existing Vertex AI TensorBoard resource to attach to the job. * Format: projects/{project}/locations/{region}/tensorboards/{tensorboard_id} + * See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview + * for the Tensorboard data model. */ @SerialVersionUID(0L) final case class VertexAiResourceConfig( diff --git a/scala/common/src/main/scala/snapchat/research/gbml/trained_model_metadata/TrainedModelMetadata.scala b/scala/common/src/main/scala/snapchat/research/gbml/trained_model_metadata/TrainedModelMetadata.scala index bcf95c046..2ae44b3a5 100644 --- a/scala/common/src/main/scala/snapchat/research/gbml/trained_model_metadata/TrainedModelMetadata.scala +++ b/scala/common/src/main/scala/snapchat/research/gbml/trained_model_metadata/TrainedModelMetadata.scala @@ -12,7 +12,10 @@ package snapchat.research.gbml.trained_model_metadata * @param evalMetricsUri * The path where evaluation metrics are stored * @param tensorboardLogsUri - * Path where tensorboard logs will be stored + * Path where tensorboard logs will be stored. Vertex AI maps this URI to + * ``AIP_TENSORBOARD_LOG_DIR`` inside trainer containers via + * ``CustomJobSpec.baseOutputDirectory``. See + * https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec. 
*/ @SerialVersionUID(0L) final case class TrainedModelMetadata( @@ -26,28 +29,28 @@ final case class TrainedModelMetadata( private[this] var __serializedSizeMemoized: _root_.scala.Int = 0 private[this] def __computeSerializedSize(): _root_.scala.Int = { var __size = 0 - + { val __value = trainedModelUri if (!__value.isEmpty) { __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(1, __value) } }; - + { val __value = scriptedModelUri if (!__value.isEmpty) { __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(2, __value) } }; - + { val __value = evalMetricsUri if (!__value.isEmpty) { __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(3, __value) } }; - + { val __value = tensorboardLogsUri if (!__value.isEmpty) { @@ -64,7 +67,7 @@ final case class TrainedModelMetadata( __serializedSizeMemoized = __size } __size - 1 - + } def writeTo(`_output__`: _root_.com.google.protobuf.CodedOutputStream): _root_.scala.Unit = { { diff --git a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala index d2394a65e..8a29093bb 100644 --- a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala +++ b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala @@ -39,6 +39,8 @@ package snapchat.research.gbml.gigl_resource_config * @param tensorboardResourceName * Existing Vertex AI TensorBoard resource to attach to the job. * Format: projects/{project}/locations/{region}/tensorboards/{tensorboard_id} + * See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview + * for the Tensorboard data model. 
*/ @SerialVersionUID(0L) final case class VertexAiResourceConfig( diff --git a/scala_spark35/common/src/main/scala/snapchat/research/gbml/trained_model_metadata/TrainedModelMetadata.scala b/scala_spark35/common/src/main/scala/snapchat/research/gbml/trained_model_metadata/TrainedModelMetadata.scala index bcf95c046..2ae44b3a5 100644 --- a/scala_spark35/common/src/main/scala/snapchat/research/gbml/trained_model_metadata/TrainedModelMetadata.scala +++ b/scala_spark35/common/src/main/scala/snapchat/research/gbml/trained_model_metadata/TrainedModelMetadata.scala @@ -12,7 +12,10 @@ package snapchat.research.gbml.trained_model_metadata * @param evalMetricsUri * The path where evaluation metrics are stored * @param tensorboardLogsUri - * Path where tensorboard logs will be stored + * Path where tensorboard logs will be stored. Vertex AI maps this URI to + * ``AIP_TENSORBOARD_LOG_DIR`` inside trainer containers via + * ``CustomJobSpec.baseOutputDirectory``. See + * https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec. 
*/ @SerialVersionUID(0L) final case class TrainedModelMetadata( @@ -26,28 +29,28 @@ final case class TrainedModelMetadata( private[this] var __serializedSizeMemoized: _root_.scala.Int = 0 private[this] def __computeSerializedSize(): _root_.scala.Int = { var __size = 0 - + { val __value = trainedModelUri if (!__value.isEmpty) { __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(1, __value) } }; - + { val __value = scriptedModelUri if (!__value.isEmpty) { __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(2, __value) } }; - + { val __value = evalMetricsUri if (!__value.isEmpty) { __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(3, __value) } }; - + { val __value = tensorboardLogsUri if (!__value.isEmpty) { @@ -64,7 +67,7 @@ final case class TrainedModelMetadata( __serializedSizeMemoized = __size } __size - 1 - + } def writeTo(`_output__`: _root_.com.google.protobuf.CodedOutputStream): _root_.scala.Unit = { { diff --git a/snapchat/research/gbml/gigl_resource_config_pb2.pyi b/snapchat/research/gbml/gigl_resource_config_pb2.pyi index 8522294a9..250c69973 100644 --- a/snapchat/research/gbml/gigl_resource_config_pb2.pyi +++ b/snapchat/research/gbml/gigl_resource_config_pb2.pyi @@ -298,6 +298,8 @@ class VertexAiResourceConfig(google.protobuf.message.Message): tensorboard_resource_name: builtins.str """Existing Vertex AI TensorBoard resource to attach to the job. Format: projects/{project}/locations/{region}/tensorboards/{tensorboard_id} + See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview + for the Tensorboard data model. 
""" def __init__( self, diff --git a/snapchat/research/gbml/trained_model_metadata_pb2.pyi b/snapchat/research/gbml/trained_model_metadata_pb2.pyi index 5bdb95d48..9fa9f7886 100644 --- a/snapchat/research/gbml/trained_model_metadata_pb2.pyi +++ b/snapchat/research/gbml/trained_model_metadata_pb2.pyi @@ -28,7 +28,11 @@ class TrainedModelMetadata(google.protobuf.message.Message): eval_metrics_uri: builtins.str """The path where evaluation metrics are stored""" tensorboard_logs_uri: builtins.str - """Path where tensorboard logs will be stored""" + """Path where tensorboard logs will be stored. Vertex AI maps this URI to + ``AIP_TENSORBOARD_LOG_DIR`` inside trainer containers via + ``CustomJobSpec.baseOutputDirectory``. See + https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec. + """ def __init__( self, *, From 5e4d9b2741e045b13b9ac7bcf90e576fe47659e6 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 23:19:55 +0000 Subject: [PATCH 48/59] proto: move tensorboard_experiment_name to VertexAiResourceConfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per-environment metaparam, just like tensorboard_resource_name. Single-pool trainers read it off their VertexAiResourceConfig directly; graph-store trainers read it off compute_pool (which is itself a VertexAiResourceConfig), matching how other Vertex AI metaparams already flow. This lets the launcher signature shrink: launch_single_pool_job / launch_graph_store_enabled_job / _build_job_config no longer take tensorboard_experiment_name as a parameter — they pull it from the proto. The v2 GLT trainer correspondingly drops the gbml_config extraction. Validation simplifies: both fields now live on the same proto object, so the single-dispatch pattern that already handled tensorboard_resource_name now handles experiment_name too. Tests rewritten to set the field on the resource config proto. 
The tests/unit/src/training/glt_trainer_test.py file now tests dispatch-by-type since the experiment-name extraction code path is gone. --- gigl/src/common/vertex_ai_launcher.py | 34 +++-- gigl/src/training/v2/glt_trainer.py | 5 - ...nd_resource_config_compatibility_checks.py | 44 ++---- .../snapchat/research/gbml/gbml_config.proto | 11 -- .../research/gbml/gigl_resource_config.proto | 10 ++ .../gbml/gbml_config/GbmlConfig.scala | 40 ----- .../gbml/gbml_config/GbmlConfigProto.scala | 61 ++++---- .../GiglResourceConfigProto.scala | 141 +++++++++--------- .../VertexAiResourceConfig.scala | 47 +++++- .../gbml/gbml_config/GbmlConfig.scala | 40 ----- .../gbml/gbml_config/GbmlConfigProto.scala | 61 ++++---- .../GiglResourceConfigProto.scala | 141 +++++++++--------- .../VertexAiResourceConfig.scala | 47 +++++- snapchat/research/gbml/gbml_config_pb2.py | 46 +++--- snapchat/research/gbml/gbml_config_pb2.pyi | 15 +- .../research/gbml/gigl_resource_config_pb2.py | 48 +++--- .../gbml/gigl_resource_config_pb2.pyi | 14 +- .../src/common/vertex_ai_launcher_test.py | 77 ++++------ tests/unit/src/training/glt_trainer_test.py | 79 +++------- ...source_config_compatibility_checks_test.py | 58 +++++-- 20 files changed, 487 insertions(+), 532 deletions(-) diff --git a/gigl/src/common/vertex_ai_launcher.py b/gigl/src/common/vertex_ai_launcher.py index 32bdd7953..955b6b4ed 100644 --- a/gigl/src/common/vertex_ai_launcher.py +++ b/gigl/src/common/vertex_ai_launcher.py @@ -87,10 +87,13 @@ def launch_single_pool_job( component: GiGLComponents, vertex_ai_region: str, tensorboard_logs_uri: Optional[Uri] = None, - tensorboard_experiment_name: Optional[str] = None, ) -> aiplatform.CustomJob: """Launch a single pool job on Vertex AI. + The ``tensorboard_resource_name`` and ``tensorboard_experiment_name`` + fields on ``vertex_ai_resource_config`` drive TensorBoard wiring; the + launcher reads them directly off the proto. 
+ Args: vertex_ai_resource_config: The Vertex AI resource configuration job_name: Full name for the Vertex AI job @@ -104,10 +107,6 @@ def launch_single_pool_job( component: The GiGL component (Trainer or Inferencer) vertex_ai_region: The Vertex AI region to launch the job in tensorboard_logs_uri: Optional TensorBoard log URI for trainer jobs - tensorboard_experiment_name: Optional Vertex AI Experiment name. When set, - the trainer's CustomJob is submitted as a run of the named experiment so - multiple jobs sharing the name can be compared on a single TensorBoard - page. See ``VertexAiJobConfig.tensorboard_experiment_name``. Returns: The submitted ``aiplatform.CustomJob``. Useful for callers that need @@ -137,7 +136,6 @@ def launch_single_pool_job( env_vars=[env_var.EnvVar(name="TF_CPP_MIN_LOG_LEVEL", value="3")], labels=resource_config_wrapper.get_resource_labels(component=component), tensorboard_logs_uri=tensorboard_logs_uri, - tensorboard_experiment_name=tensorboard_experiment_name, ) logger.info(f"Launching {component.value} job with config: {job_config}") @@ -164,10 +162,15 @@ def launch_graph_store_enabled_job( cuda_docker_uri: Optional[str], component: GiGLComponents, tensorboard_logs_uri: Optional[Uri] = None, - tensorboard_experiment_name: Optional[str] = None, ) -> None: """Launch a graph store enabled job on Vertex AI with separate storage and compute pools. + The ``compute_pool`` of ``vertex_ai_graph_store_config`` carries + ``tensorboard_resource_name`` and ``tensorboard_experiment_name`` (the + same Vertex AI metaparams that single-pool reads off its own + ``VertexAiResourceConfig``); the launcher reads them directly off the + proto. 
+ Args: vertex_ai_graph_store_config: The Vertex AI graph store configuration job_name: Full name for the Vertex AI job @@ -182,10 +185,6 @@ def launch_graph_store_enabled_job( cuda_docker_uri: Docker image URI for GPU execution component: The GiGL component (Trainer or Inferencer) tensorboard_logs_uri: Optional TensorBoard log URI for trainer jobs - tensorboard_experiment_name: Optional Vertex AI Experiment name. When set, - the trainer's CustomJob is submitted as a run of the named experiment so - multiple jobs sharing the name can be compared on a single TensorBoard - page. See ``VertexAiJobConfig.tensorboard_experiment_name``. """ if component not in _LAUNCHABLE_COMPONENTS: raise ValueError( @@ -240,7 +239,6 @@ def launch_graph_store_enabled_job( env_vars=environment_variables, labels=labels, tensorboard_logs_uri=tensorboard_logs_uri, - tensorboard_experiment_name=tensorboard_experiment_name, ) # Create storage pool job config @@ -288,7 +286,6 @@ def _build_job_config( env_vars: list[env_var.EnvVar], labels: Optional[dict[str, str]] = None, tensorboard_logs_uri: Optional[Uri] = None, - tensorboard_experiment_name: Optional[str] = None, ) -> VertexAiJobConfig: """Build a VertexAiJobConfig for training or inference jobs. @@ -296,6 +293,11 @@ def _build_job_config( jobs on Vertex AI. It assembles job arguments, sets appropriate job naming conventions, and configures resource specifications based on the provided parameters. + ``tensorboard_resource_name`` and ``tensorboard_experiment_name`` come + from ``vertex_ai_resource_config`` directly — single-pool launches read + them off the trainer's ``VertexAiResourceConfig``; graph-store launches + pass ``compute_pool`` here, which carries the same fields. + Args: job_name (str): The base name for the job. Will be prefixed with "gigl_train_" or "gigl_infer_". is_inference (bool): Whether this is an inference job (True) or training job (False). 
@@ -310,13 +312,13 @@ def _build_job_config( env_vars (list[env_var.EnvVar]): Environment variables to set in the container. labels (Optional[dict[str, str]]): Labels to associate with the job. Defaults to None. tensorboard_logs_uri (Optional[Uri]): TensorBoard log URI for trainer jobs. - tensorboard_experiment_name (Optional[str]): If set, the job is - submitted as a run of the named Vertex AI Experiment. See - ``VertexAiJobConfig.tensorboard_experiment_name``. Returns: VertexAiJobConfig: A configuration object ready to be used with VertexAIService.launch_job(). """ + tensorboard_experiment_name = ( + vertex_ai_resource_config.tensorboard_experiment_name or None + ) job_args = ( [ f"--job_name={job_name}", diff --git a/gigl/src/training/v2/glt_trainer.py b/gigl/src/training/v2/glt_trainer.py index e95ba22c8..4f2ecadd1 100644 --- a/gigl/src/training/v2/glt_trainer.py +++ b/gigl/src/training/v2/glt_trainer.py @@ -60,9 +60,6 @@ def __execute_VAI_training( if raw_tensorboard_logs_uri else None ) - tensorboard_experiment_name = ( - gbml_config_pb_wrapper.trainer_config.tensorboard_experiment_name or None - ) job_name = f"gigl_train_{applied_task_identifier}" @@ -80,7 +77,6 @@ def __execute_VAI_training( component=GiGLComponents.Trainer, vertex_ai_region=resource_config.vertex_ai_trainer_region, tensorboard_logs_uri=tensorboard_logs_uri, - tensorboard_experiment_name=tensorboard_experiment_name, ) elif isinstance(resource_config.trainer_config, VertexAiGraphStoreConfig): launch_graph_store_enabled_job( @@ -97,7 +93,6 @@ def __execute_VAI_training( cuda_docker_uri=cuda_docker_uri, component=GiGLComponents.Trainer, tensorboard_logs_uri=tensorboard_logs_uri, - tensorboard_experiment_name=tensorboard_experiment_name, ) else: raise NotImplementedError( diff --git a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py index ff124615a..2703915d1 100644 --- 
a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py +++ b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py @@ -120,44 +120,32 @@ def check_vertex_ai_trainer_tensorboard_compatibility( "Config validation check: Vertex AI trainer TensorBoard compatibility between template and resource configs." ) - experiment_name = gbml_config_pb_wrapper.trainer_config.tensorboard_experiment_name - if experiment_name: - trainer_resource_config = resource_config_wrapper.trainer_config - if isinstance( - trainer_resource_config, gigl_resource_config_pb2.VertexAiResourceConfig - ): - tb_resource = trainer_resource_config.tensorboard_resource_name - elif isinstance( - trainer_resource_config, gigl_resource_config_pb2.VertexAiGraphStoreConfig - ): - tb_resource = trainer_resource_config.compute_pool.tensorboard_resource_name - else: - tb_resource = "" - assert tb_resource, ( - "GbmlConfig.trainer_config.tensorboard_experiment_name is set " - f"({experiment_name!r}) but no Vertex AI TensorBoard resource is " - "configured on the trainer resource config; the experiment needs a " - "backing TB resource." - ) - - if not gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard: - return - trainer_resource_config = resource_config_wrapper.trainer_config if isinstance( trainer_resource_config, gigl_resource_config_pb2.VertexAiResourceConfig ): - tensorboard_resource_name = trainer_resource_config.tensorboard_resource_name + vertex_ai_config = trainer_resource_config elif isinstance( trainer_resource_config, gigl_resource_config_pb2.VertexAiGraphStoreConfig ): - tensorboard_resource_name = ( - trainer_resource_config.compute_pool.tensorboard_resource_name - ) + # Graph-store mode reads TB metaparams from the compute pool, the + # same way it reads other Vertex AI resource fields. 
+ vertex_ai_config = trainer_resource_config.compute_pool else: return - assert tensorboard_resource_name, ( + if vertex_ai_config.tensorboard_experiment_name: + assert vertex_ai_config.tensorboard_resource_name, ( + "VertexAiResourceConfig.tensorboard_experiment_name is set " + f"({vertex_ai_config.tensorboard_experiment_name!r}) but no " + "Vertex AI TensorBoard resource is configured; the experiment " + "needs a backing TB resource." + ) + + if not gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard: + return + + assert vertex_ai_config.tensorboard_resource_name, ( "GbmlConfig.trainer_config.should_log_to_tensorboard is true, so a " "Vertex AI TensorBoard resource name must be set in the trainer " "resource config." diff --git a/proto/snapchat/research/gbml/gbml_config.proto b/proto/snapchat/research/gbml/gbml_config.proto index b2f5c7aa1..b8e50d834 100644 --- a/proto/snapchat/research/gbml/gbml_config.proto +++ b/proto/snapchat/research/gbml/gbml_config.proto @@ -201,17 +201,6 @@ message GbmlConfig { // Weather to log to tensorboard or not (defaults to false) bool should_log_to_tensorboard = 12; - // Optional. When set, the trainer's chief rank streams events to a - // TensorboardExperiment with this name on the configured Tensorboard - // resource, in addition to Vertex's built-in per-job auto-upload. - // Multiple jobs that share the same value land in the same - // TensorboardExperiment, so they appear as comparable runs on one - // TensorBoard page. Requires - // GiglResourceConfig...tensorboard_resource_name to be set. Allowed - // characters: lowercase letters, digits, hyphens (Vertex AI Experiment - // ID rules). - string tensorboard_experiment_name = 14; - // Configuration for GraphStore storage. // If setup, then GiGLResourceConfig.trainer_resource_config.vertex_ai_graph_store_trainer_config must be set. // e.g. With separte job configs for storage and compute jobs. 
diff --git a/proto/snapchat/research/gbml/gigl_resource_config.proto b/proto/snapchat/research/gbml/gigl_resource_config.proto index 29f0c9762..292910a1c 100644 --- a/proto/snapchat/research/gbml/gigl_resource_config.proto +++ b/proto/snapchat/research/gbml/gigl_resource_config.proto @@ -136,6 +136,16 @@ message VertexAiResourceConfig { // See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview // for the Tensorboard data model. string tensorboard_resource_name = 10; + + // Optional. When set, the trainer's chief rank streams events to a + // TensorboardExperiment with this name on the TB resource above, in + // addition to Vertex's per-job auto-upload. Multiple jobs that share this + // value land in the same TensorboardExperiment, so they appear as + // comparable runs on one TensorBoard page. Requires + // tensorboard_resource_name above to be set. Allowed characters: + // lowercase letters, digits, hyphens (Vertex AI Experiment ID rules). + // See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview. + string tensorboard_experiment_name = 11; } // Configuration for KFP job resources diff --git a/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala b/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala index 658cd15ae..63c31ede6 100644 --- a/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala +++ b/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala @@ -3966,16 +3966,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb * Arguments to parameterize training process with. * @param shouldLogToTensorboard * Weather to log to tensorboard or not (defaults to false) - * @param tensorboardExperimentName - * Optional. 
When set, the trainer's chief rank streams events to a - * TensorboardExperiment with this name on the configured Tensorboard - * resource, in addition to Vertex's built-in per-job auto-upload. - * Multiple jobs that share the same value land in the same - * TensorboardExperiment, so they appear as comparable runs on one - * TensorBoard page. Requires - * GiglResourceConfig...tensorboard_resource_name to be set. Allowed - * characters: lowercase letters, digits, hyphens (Vertex AI Experiment - * ID rules). */ @SerialVersionUID(0L) final case class TrainerConfig( @@ -3983,7 +3973,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerArgs: _root_.scala.collection.immutable.Map[_root_.scala.Predef.String, _root_.scala.Predef.String] = _root_.scala.collection.immutable.Map.empty, executable: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty, shouldLogToTensorboard: _root_.scala.Boolean = false, - tensorboardExperimentName: _root_.scala.Predef.String = "", storageConfig: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.Empty, unknownFields: _root_.scalapb.UnknownFieldSet = _root_.scalapb.UnknownFieldSet.empty ) extends scalapb.GeneratedMessage with scalapb.lenses.Updatable[TrainerConfig] { @@ -4017,13 +4006,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb __size += _root_.com.google.protobuf.CodedOutputStream.computeBoolSize(12, __value) } }; - - { - val __value = tensorboardExperimentName - if (!__value.isEmpty) { - __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(14, __value) - } - }; if (storageConfig.graphStoreStorageConfig.isDefined) { val __value = storageConfig.graphStoreStorageConfig.get __size += 1 + 
_root_.com.google.protobuf.CodedOutputStream.computeUInt32SizeNoTag(__value.serializedSize) + __value.serializedSize @@ -4065,12 +4047,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb _output__.writeUInt32NoTag(__m.serializedSize) __m.writeTo(_output__) }; - { - val __v = tensorboardExperimentName - if (!__v.isEmpty) { - _output__.writeString(14, __v) - } - }; executable.clsPath.foreach { __v => val __m = __v _output__.writeString(100, __m) @@ -4091,7 +4067,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb def getCommand: _root_.scala.Predef.String = executable.command.getOrElse("") def withCommand(__v: _root_.scala.Predef.String): TrainerConfig = copy(executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Command(__v)) def withShouldLogToTensorboard(__v: _root_.scala.Boolean): TrainerConfig = copy(shouldLogToTensorboard = __v) - def withTensorboardExperimentName(__v: _root_.scala.Predef.String): TrainerConfig = copy(tensorboardExperimentName = __v) def getGraphStoreStorageConfig: snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig = storageConfig.graphStoreStorageConfig.getOrElse(snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig.defaultInstance) def withGraphStoreStorageConfig(__v: snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig): TrainerConfig = copy(storageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.GraphStoreStorageConfig(__v)) def clearExecutable: TrainerConfig = copy(executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty) @@ -4113,10 +4088,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb val __t = shouldLogToTensorboard if (__t != false) __t else null } - case 14 => { - val __t = tensorboardExperimentName - if (__t != "") __t else null - } case 13 => 
storageConfig.graphStoreStorageConfig.orNull } } @@ -4128,7 +4099,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb case 100 => executable.clsPath.map(_root_.scalapb.descriptors.PString(_)).getOrElse(_root_.scalapb.descriptors.PEmpty) case 101 => executable.command.map(_root_.scalapb.descriptors.PString(_)).getOrElse(_root_.scalapb.descriptors.PEmpty) case 12 => _root_.scalapb.descriptors.PBoolean(shouldLogToTensorboard) - case 14 => _root_.scalapb.descriptors.PString(tensorboardExperimentName) case 13 => storageConfig.graphStoreStorageConfig.map(_.toPMessage).getOrElse(_root_.scalapb.descriptors.PEmpty) } } @@ -4143,7 +4113,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb var __trainerClsPath: _root_.scala.Predef.String = "" val __trainerArgs: _root_.scala.collection.mutable.Builder[(_root_.scala.Predef.String, _root_.scala.Predef.String), _root_.scala.collection.immutable.Map[_root_.scala.Predef.String, _root_.scala.Predef.String]] = _root_.scala.collection.immutable.Map.newBuilder[_root_.scala.Predef.String, _root_.scala.Predef.String] var __shouldLogToTensorboard: _root_.scala.Boolean = false - var __tensorboardExperimentName: _root_.scala.Predef.String = "" var __executable: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty var __storageConfig: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.Empty var `_unknownFields__`: _root_.scalapb.UnknownFieldSet.Builder = null @@ -4162,8 +4131,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb __executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Command(_input__.readStringRequireUtf8()) case 96 => __shouldLogToTensorboard = _input__.readBool() - case 114 => - 
__tensorboardExperimentName = _input__.readStringRequireUtf8() case 106 => __storageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.GraphStoreStorageConfig(__storageConfig.graphStoreStorageConfig.fold(_root_.scalapb.LiteParser.readMessage[snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig](_input__))(_root_.scalapb.LiteParser.readMessage(_input__, _))) case tag => @@ -4177,7 +4144,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerClsPath = __trainerClsPath, trainerArgs = __trainerArgs.result(), shouldLogToTensorboard = __shouldLogToTensorboard, - tensorboardExperimentName = __tensorboardExperimentName, executable = __executable, storageConfig = __storageConfig, unknownFields = if (_unknownFields__ == null) _root_.scalapb.UnknownFieldSet.empty else _unknownFields__.result() @@ -4190,7 +4156,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerClsPath = __fieldsMap.get(scalaDescriptor.findFieldByNumber(1).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), trainerArgs = __fieldsMap.get(scalaDescriptor.findFieldByNumber(2).get).map(_.as[_root_.scala.Seq[snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.TrainerArgsEntry]]).getOrElse(_root_.scala.Seq.empty).iterator.map(snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig._typemapper_trainerArgs.toCustom(_)).toMap, shouldLogToTensorboard = __fieldsMap.get(scalaDescriptor.findFieldByNumber(12).get).map(_.as[_root_.scala.Boolean]).getOrElse(false), - tensorboardExperimentName = __fieldsMap.get(scalaDescriptor.findFieldByNumber(14).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), executable = __fieldsMap.get(scalaDescriptor.findFieldByNumber(100).get).flatMap(_.as[_root_.scala.Option[_root_.scala.Predef.String]]).map(snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.ClsPath(_)) 
.orElse[snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable](__fieldsMap.get(scalaDescriptor.findFieldByNumber(101).get).flatMap(_.as[_root_.scala.Option[_root_.scala.Predef.String]]).map(snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Command(_))) .getOrElse(snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty), @@ -4218,7 +4183,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerClsPath = "", trainerArgs = _root_.scala.collection.immutable.Map.empty, shouldLogToTensorboard = false, - tensorboardExperimentName = "", executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty, storageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.Empty ) @@ -4429,7 +4393,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb def clsPath: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.getClsPath)((c_, f_) => c_.copy(executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.ClsPath(f_))) def command: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.getCommand)((c_, f_) => c_.copy(executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Command(f_))) def shouldLogToTensorboard: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Boolean] = field(_.shouldLogToTensorboard)((c_, f_) => c_.copy(shouldLogToTensorboard = f_)) - def tensorboardExperimentName: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.tensorboardExperimentName)((c_, f_) => c_.copy(tensorboardExperimentName = f_)) def graphStoreStorageConfig: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig] = field(_.getGraphStoreStorageConfig)((c_, f_) => c_.copy(storageConfig = 
snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.GraphStoreStorageConfig(f_))) def executable: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable] = field(_.executable)((c_, f_) => c_.copy(executable = f_)) def storageConfig: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig] = field(_.storageConfig)((c_, f_) => c_.copy(storageConfig = f_)) @@ -4439,7 +4402,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb final val CLS_PATH_FIELD_NUMBER = 100 final val COMMAND_FIELD_NUMBER = 101 final val SHOULD_LOG_TO_TENSORBOARD_FIELD_NUMBER = 12 - final val TENSORBOARD_EXPERIMENT_NAME_FIELD_NUMBER = 14 final val GRAPH_STORE_STORAGE_CONFIG_FIELD_NUMBER = 13 @transient private[gbml_config] val _typemapper_trainerArgs: _root_.scalapb.TypeMapper[snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.TrainerArgsEntry, (_root_.scala.Predef.String, _root_.scala.Predef.String)] = implicitly[_root_.scalapb.TypeMapper[snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.TrainerArgsEntry, (_root_.scala.Predef.String, _root_.scala.Predef.String)]] @@ -4448,14 +4410,12 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerArgs: _root_.scala.collection.immutable.Map[_root_.scala.Predef.String, _root_.scala.Predef.String], executable: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable, shouldLogToTensorboard: _root_.scala.Boolean, - tensorboardExperimentName: _root_.scala.Predef.String, storageConfig: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig ): _root_.snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig = _root_.snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig( trainerClsPath, trainerArgs, executable, shouldLogToTensorboard, - tensorboardExperimentName, storageConfig ) // 
@@protoc_insertion_point(GeneratedMessageCompanion[snapchat.research.gbml.GbmlConfig.TrainerConfig]) diff --git a/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfigProto.scala b/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfigProto.scala index 5aa5dd8a7..a9c35d542 100644 --- a/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfigProto.scala +++ b/scala/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfigProto.scala @@ -26,7 +26,7 @@ object GbmlConfigProto extends _root_.scalapb.GeneratedFileObject { GhfbWV0YWRhdGEucHJvdG8aLXNuYXBjaGF0L3Jlc2VhcmNoL2dibWwvZGF0YXNldF9tZXRhZGF0YS5wcm90bxozc25hcGNoYXQvc mVzZWFyY2gvZ2JtbC90cmFpbmVkX21vZGVsX21ldGFkYXRhLnByb3RvGi9zbmFwY2hhdC9yZXNlYXJjaC9nYm1sL2luZmVyZW5jZ V9tZXRhZGF0YS5wcm90bxozc25hcGNoYXQvcmVzZWFyY2gvZ2JtbC9wb3N0cHJvY2Vzc2VkX21ldGFkYXRhLnByb3RvGjdzbmFwY - 2hhdC9yZXNlYXJjaC9nYm1sL3N1YmdyYXBoX3NhbXBsaW5nX3N0cmF0ZWd5LnByb3RvIqpNCgpHYm1sQ29uZmlnEmcKDXRhc2tfb + 2hhdC9yZXNlYXJjaC9nYm1sL3N1YmdyYXBoX3NhbXBsaW5nX3N0cmF0ZWd5LnByb3RvIspMCgpHYm1sQ29uZmlnEmcKDXRhc2tfb WV0YWRhdGEYASABKAsyLy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkdibWxDb25maWcuVGFza01ldGFkYXRhQhHiPw4SDHRhc2tNZ XRhZGF0YVIMdGFza01ldGFkYXRhEmAKDmdyYXBoX21ldGFkYXRhGAIgASgLMiUuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5HcmFwa E1ldGFkYXRhQhLiPw8SDWdyYXBoTWV0YWRhdGFSDWdyYXBoTWV0YWRhdGESZwoNc2hhcmVkX2NvbmZpZxgDIAEoCzIvLnNuYXBja @@ -125,40 +125,39 @@ object GbmlConfigProto extends _root_.scalapb.GeneratedFileObject { BgBIAEoCUIM4j8JEgdjb21tYW5kUgdjb21tYW5kEoABCgxzdG9yYWdlX2FyZ3MYAiADKAsySy5zbmFwY2hhdC5yZXNlYXJjaC5nY m1sLkdibWxDb25maWcuR3JhcGhTdG9yZVN0b3JhZ2VDb25maWcuU3RvcmFnZUFyZ3NFbnRyeUIQ4j8NEgtzdG9yYWdlQXJnc1ILc 3RvcmFnZUFyZ3MaVAoQU3RvcmFnZUFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCC - uI/BxIFdmFsdWVSBXZhbHVlOgI4ARrjBQoNVHJhaW5lckNvbmZpZxI9ChB0cmFpbmVyX2Nsc19wYXRoGAEgASgJQhPiPxASDnRyY + uI/BxIFdmFsdWVSBXZhbHVlOgI4ARqDBQoNVHJhaW5lckNvbmZpZxI9ChB0cmFpbmVyX2Nsc19wYXRoGAEgASgJQhPiPxASDnRyY 
WluZXJDbHNQYXRoUg50cmFpbmVyQ2xzUGF0aBJ2Cgx0cmFpbmVyX2FyZ3MYAiADKAsyQS5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sL kdibWxDb25maWcuVHJhaW5lckNvbmZpZy5UcmFpbmVyQXJnc0VudHJ5QhDiPw0SC3RyYWluZXJBcmdzUgt0cmFpbmVyQXJncxIpC ghjbHNfcGF0aBhkIAEoCUIM4j8JEgdjbHNQYXRoSABSB2Nsc1BhdGgSKAoHY29tbWFuZBhlIAEoCUIM4j8JEgdjb21tYW5kSABSB 2NvbW1hbmQSVgoZc2hvdWxkX2xvZ190b190ZW5zb3Jib2FyZBgMIAEoCEIb4j8YEhZzaG91bGRMb2dUb1RlbnNvcmJvYXJkUhZza - G91bGRMb2dUb1RlbnNvcmJvYXJkEl4KG3RlbnNvcmJvYXJkX2V4cGVyaW1lbnRfbmFtZRgOIAEoCUIe4j8bEhl0ZW5zb3Jib2FyZ - EV4cGVyaW1lbnROYW1lUhl0ZW5zb3Jib2FyZEV4cGVyaW1lbnROYW1lEpcBChpncmFwaF9zdG9yZV9zdG9yYWdlX2NvbmZpZxgNI - AEoCzI6LnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuR2JtbENvbmZpZy5HcmFwaFN0b3JlU3RvcmFnZUNvbmZpZ0Ic4j8ZEhdncmFwa - FN0b3JlU3RvcmFnZUNvbmZpZ0gBUhdncmFwaFN0b3JlU3RvcmFnZUNvbmZpZxpUChBUcmFpbmVyQXJnc0VudHJ5EhoKA2tleRgBI - AEoCUII4j8FEgNrZXlSA2tleRIgCgV2YWx1ZRgCIAEoCUIK4j8HEgV2YWx1ZVIFdmFsdWU6AjgBQgwKCmV4ZWN1dGFibGVCEAoOc - 3RvcmFnZV9jb25maWcalQUKEEluZmVyZW5jZXJDb25maWcShQEKD2luZmVyZW5jZXJfYXJncxgBIAMoCzJHLnNuYXBjaGF0LnJlc - 2VhcmNoLmdibWwuR2JtbENvbmZpZy5JbmZlcmVuY2VyQ29uZmlnLkluZmVyZW5jZXJBcmdzRW50cnlCE+I/EBIOaW5mZXJlbmNlc - kFyZ3NSDmluZmVyZW5jZXJBcmdzEkYKE2luZmVyZW5jZXJfY2xzX3BhdGgYAiABKAlCFuI/ExIRaW5mZXJlbmNlckNsc1BhdGhSE - WluZmVyZW5jZXJDbHNQYXRoEikKCGNsc19wYXRoGGQgASgJQgziPwkSB2Nsc1BhdGhIAFIHY2xzUGF0aBIoCgdjb21tYW5kGGUgA - SgJQgziPwkSB2NvbW1hbmRIAFIHY29tbWFuZBJJChRpbmZlcmVuY2VfYmF0Y2hfc2l6ZRgFIAEoDUIX4j8UEhJpbmZlcmVuY2VCY - XRjaFNpemVSEmluZmVyZW5jZUJhdGNoU2l6ZRKXAQoaZ3JhcGhfc3RvcmVfc3RvcmFnZV9jb25maWcYBiABKAsyOi5zbmFwY2hhd - C5yZXNlYXJjaC5nYm1sLkdibWxDb25maWcuR3JhcGhTdG9yZVN0b3JhZ2VDb25maWdCHOI/GRIXZ3JhcGhTdG9yZVN0b3JhZ2VDb - 25maWdIAVIXZ3JhcGhTdG9yZVN0b3JhZ2VDb25maWcaVwoTSW5mZXJlbmNlckFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa - 2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4AUIMCgpleGVjdXRhYmxlQhAKDnN0b3JhZ2VfY29uZ - mlnGtsCChNQb3N0UHJvY2Vzc29yQ29uZmlnEpUBChNwb3N0X3Byb2Nlc3Nvcl9hcmdzGAEgAygLMk0uc25hcGNoYXQucmVzZWFyY - 
2guZ2JtbC5HYm1sQ29uZmlnLlBvc3RQcm9jZXNzb3JDb25maWcuUG9zdFByb2Nlc3NvckFyZ3NFbnRyeUIW4j8TEhFwb3N0UHJvY - 2Vzc29yQXJnc1IRcG9zdFByb2Nlc3NvckFyZ3MSUAoXcG9zdF9wcm9jZXNzb3JfY2xzX3BhdGgYAiABKAlCGeI/FhIUcG9zdFByb - 2Nlc3NvckNsc1BhdGhSFHBvc3RQcm9jZXNzb3JDbHNQYXRoGloKFlBvc3RQcm9jZXNzb3JBcmdzRW50cnkSGgoDa2V5GAEgASgJQ - gjiPwUSA2tleVIDa2V5EiAKBXZhbHVlGAIgASgJQgriPwcSBXZhbHVlUgV2YWx1ZToCOAEanAIKDU1ldHJpY3NDb25maWcSPQoQb - WV0cmljc19jbHNfcGF0aBgBIAEoCUIT4j8QEg5tZXRyaWNzQ2xzUGF0aFIObWV0cmljc0Nsc1BhdGgSdgoMbWV0cmljc19hcmdzG - AIgAygLMkEuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5HYm1sQ29uZmlnLk1ldHJpY3NDb25maWcuTWV0cmljc0FyZ3NFbnRyeUIQ4 - j8NEgttZXRyaWNzQXJnc1ILbWV0cmljc0FyZ3MaVAoQTWV0cmljc0FyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZ - XkSIAoFdmFsdWUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4ARr0AgoOUHJvZmlsZXJDb25maWcSTwoWc2hvdWxkX2VuYWJsZ - V9wcm9maWxlchgBIAEoCEIZ4j8WEhRzaG91bGRFbmFibGVQcm9maWxlclIUc2hvdWxkRW5hYmxlUHJvZmlsZXISPQoQcHJvZmlsZ - XJfbG9nX2RpchgCIAEoCUIT4j8QEg5wcm9maWxlckxvZ0RpclIOcHJvZmlsZXJMb2dEaXISewoNcHJvZmlsZXJfYXJncxgDIAMoC - zJDLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuR2JtbENvbmZpZy5Qcm9maWxlckNvbmZpZy5Qcm9maWxlckFyZ3NFbnRyeUIR4j8OE - gxwcm9maWxlckFyZ3NSDHByb2ZpbGVyQXJncxpVChFQcm9maWxlckFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZ - XkSIAoFdmFsdWUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4ARpVChFGZWF0dXJlRmxhZ3NFbnRyeRIaCgNrZXkYASABKAlCC - OI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4AWIGcHJvdG8z""" + G91bGRMb2dUb1RlbnNvcmJvYXJkEpcBChpncmFwaF9zdG9yZV9zdG9yYWdlX2NvbmZpZxgNIAEoCzI6LnNuYXBjaGF0LnJlc2Vhc + mNoLmdibWwuR2JtbENvbmZpZy5HcmFwaFN0b3JlU3RvcmFnZUNvbmZpZ0Ic4j8ZEhdncmFwaFN0b3JlU3RvcmFnZUNvbmZpZ0gBU + hdncmFwaFN0b3JlU3RvcmFnZUNvbmZpZxpUChBUcmFpbmVyQXJnc0VudHJ5EhoKA2tleRgBIAEoCUII4j8FEgNrZXlSA2tleRIgC + gV2YWx1ZRgCIAEoCUIK4j8HEgV2YWx1ZVIFdmFsdWU6AjgBQgwKCmV4ZWN1dGFibGVCEAoOc3RvcmFnZV9jb25maWcalQUKEEluZ + mVyZW5jZXJDb25maWcShQEKD2luZmVyZW5jZXJfYXJncxgBIAMoCzJHLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuR2JtbENvbmZpZ + 
y5JbmZlcmVuY2VyQ29uZmlnLkluZmVyZW5jZXJBcmdzRW50cnlCE+I/EBIOaW5mZXJlbmNlckFyZ3NSDmluZmVyZW5jZXJBcmdzE + kYKE2luZmVyZW5jZXJfY2xzX3BhdGgYAiABKAlCFuI/ExIRaW5mZXJlbmNlckNsc1BhdGhSEWluZmVyZW5jZXJDbHNQYXRoEikKC + GNsc19wYXRoGGQgASgJQgziPwkSB2Nsc1BhdGhIAFIHY2xzUGF0aBIoCgdjb21tYW5kGGUgASgJQgziPwkSB2NvbW1hbmRIAFIHY + 29tbWFuZBJJChRpbmZlcmVuY2VfYmF0Y2hfc2l6ZRgFIAEoDUIX4j8UEhJpbmZlcmVuY2VCYXRjaFNpemVSEmluZmVyZW5jZUJhd + GNoU2l6ZRKXAQoaZ3JhcGhfc3RvcmVfc3RvcmFnZV9jb25maWcYBiABKAsyOi5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkdibWxDb + 25maWcuR3JhcGhTdG9yZVN0b3JhZ2VDb25maWdCHOI/GRIXZ3JhcGhTdG9yZVN0b3JhZ2VDb25maWdIAVIXZ3JhcGhTdG9yZVN0b + 3JhZ2VDb25maWcaVwoTSW5mZXJlbmNlckFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABK + AlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4AUIMCgpleGVjdXRhYmxlQhAKDnN0b3JhZ2VfY29uZmlnGtsCChNQb3N0UHJvY2Vzc29yQ + 29uZmlnEpUBChNwb3N0X3Byb2Nlc3Nvcl9hcmdzGAEgAygLMk0uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5HYm1sQ29uZmlnLlBvc + 3RQcm9jZXNzb3JDb25maWcuUG9zdFByb2Nlc3NvckFyZ3NFbnRyeUIW4j8TEhFwb3N0UHJvY2Vzc29yQXJnc1IRcG9zdFByb2Nlc + 3NvckFyZ3MSUAoXcG9zdF9wcm9jZXNzb3JfY2xzX3BhdGgYAiABKAlCGeI/FhIUcG9zdFByb2Nlc3NvckNsc1BhdGhSFHBvc3RQc + m9jZXNzb3JDbHNQYXRoGloKFlBvc3RQcm9jZXNzb3JBcmdzRW50cnkSGgoDa2V5GAEgASgJQgjiPwUSA2tleVIDa2V5EiAKBXZhb + HVlGAIgASgJQgriPwcSBXZhbHVlUgV2YWx1ZToCOAEanAIKDU1ldHJpY3NDb25maWcSPQoQbWV0cmljc19jbHNfcGF0aBgBIAEoC + UIT4j8QEg5tZXRyaWNzQ2xzUGF0aFIObWV0cmljc0Nsc1BhdGgSdgoMbWV0cmljc19hcmdzGAIgAygLMkEuc25hcGNoYXQucmVzZ + WFyY2guZ2JtbC5HYm1sQ29uZmlnLk1ldHJpY3NDb25maWcuTWV0cmljc0FyZ3NFbnRyeUIQ4j8NEgttZXRyaWNzQXJnc1ILbWV0c + mljc0FyZ3MaVAoQTWV0cmljc0FyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCCuI/B + xIFdmFsdWVSBXZhbHVlOgI4ARr0AgoOUHJvZmlsZXJDb25maWcSTwoWc2hvdWxkX2VuYWJsZV9wcm9maWxlchgBIAEoCEIZ4j8WE + hRzaG91bGRFbmFibGVQcm9maWxlclIUc2hvdWxkRW5hYmxlUHJvZmlsZXISPQoQcHJvZmlsZXJfbG9nX2RpchgCIAEoCUIT4j8QE + g5wcm9maWxlckxvZ0RpclIOcHJvZmlsZXJMb2dEaXISewoNcHJvZmlsZXJfYXJncxgDIAMoCzJDLnNuYXBjaGF0LnJlc2VhcmNoL + 
mdibWwuR2JtbENvbmZpZy5Qcm9maWxlckNvbmZpZy5Qcm9maWxlckFyZ3NFbnRyeUIR4j8OEgxwcm9maWxlckFyZ3NSDHByb2Zpb + GVyQXJncxpVChFQcm9maWxlckFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCCuI/B + xIFdmFsdWVSBXZhbHVlOgI4ARpVChFGZWF0dXJlRmxhZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsd + WUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4AWIGcHJvdG8z""" ).mkString) lazy val scalaDescriptor: _root_.scalapb.descriptors.FileDescriptor = { val scalaProto = com.google.protobuf.descriptor.FileDescriptorProto.parseFrom(ProtoBytes) diff --git a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala index 94ffd417b..da5ed6523 100644 --- a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala +++ b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala @@ -48,7 +48,7 @@ object GiglResourceConfigProto extends _root_.scalapb.GeneratedFileObject { XQSMwoMbnVtX3JlcGxpY2FzGAUgASgNQhDiPw0SC251bVJlcGxpY2FzUgtudW1SZXBsaWNhcyJGChJMb2NhbFRyYWluZXJDb25ma WcSMAoLbnVtX3dvcmtlcnMYASABKA1CD+I/DBIKbnVtV29ya2Vyc1IKbnVtV29ya2VycyKZAQobVmVydGV4QWlSZXNlcnZhdGlvb kFmZmluaXR5Eh0KBHR5cGUYASABKAlCCeI/BhIEdHlwZVIEdHlwZRJbChpyZXNlcnZhdGlvbl9yZXNvdXJjZV9uYW1lcxgCIAMoC - UId4j8aEhhyZXNlcnZhdGlvblJlc291cmNlTmFtZXNSGHJlc2VydmF0aW9uUmVzb3VyY2VOYW1lcyKuBQoWVmVydGV4QWlSZXNvd + UId4j8aEhhyZXNlcnZhdGlvblJlc291cmNlTmFtZXNSGHJlc2VydmF0aW9uUmVzb3VyY2VOYW1lcyKOBgoWVmVydGV4QWlSZXNvd XJjZUNvbmZpZxIzCgxtYWNoaW5lX3R5cGUYASABKAlCEOI/DRILbWFjaGluZVR5cGVSC21hY2hpbmVUeXBlEicKCGdwdV90eXBlG AIgASgJQgziPwkSB2dwdVR5cGVSB2dwdVR5cGUSKgoJZ3B1X2xpbWl0GAMgASgNQg3iPwoSCGdwdUxpbWl0UghncHVMaW1pdBIzC gxudW1fcmVwbGljYXMYBCABKA1CEOI/DRILbnVtUmVwbGljYXNSC251bVJlcGxpY2FzEiYKB3RpbWVvdXQYBSABKA1CDOI/CRIHd @@ -57,75 +57,76 @@ object GiglResourceConfigProto extends 
_root_.scalapb.GeneratedFileObject { GluZ1N0cmF0ZWd5Ej4KEWJvb3RfZGlza19zaXplX2diGAggASgNQhPiPxASDmJvb3REaXNrU2l6ZUdiUg5ib290RGlza1NpemVHY hKAAQoUcmVzZXJ2YXRpb25fYWZmaW5pdHkYCSABKAsyMy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpUmVzZXJ2YXRpb 25BZmZpbml0eUIY4j8VEhNyZXNlcnZhdGlvbkFmZmluaXR5UhNyZXNlcnZhdGlvbkFmZmluaXR5ElgKGXRlbnNvcmJvYXJkX3Jlc - 291cmNlX25hbWUYCiABKAlCHOI/GRIXdGVuc29yYm9hcmRSZXNvdXJjZU5hbWVSF3RlbnNvcmJvYXJkUmVzb3VyY2VOYW1lIooCC - hFLRlBSZXNvdXJjZUNvbmZpZxIwCgtjcHVfcmVxdWVzdBgBIAEoCUIP4j8MEgpjcHVSZXF1ZXN0UgpjcHVSZXF1ZXN0EjkKDm1lb - W9yeV9yZXF1ZXN0GAIgASgJQhLiPw8SDW1lbW9yeVJlcXVlc3RSDW1lbW9yeVJlcXVlc3QSJwoIZ3B1X3R5cGUYAyABKAlCDOI/C - RIHZ3B1VHlwZVIHZ3B1VHlwZRIqCglncHVfbGltaXQYBCABKA1CDeI/ChIIZ3B1TGltaXRSCGdwdUxpbWl0EjMKDG51bV9yZXBsa - WNhcxgFIAEoDUIQ4j8NEgtudW1SZXBsaWNhc1ILbnVtUmVwbGljYXMiRwoTTG9jYWxSZXNvdXJjZUNvbmZpZxIwCgtudW1fd29ya - 2VycxgBIAEoDUIP4j8MEgpudW1Xb3JrZXJzUgpudW1Xb3JrZXJzItkCChhWZXJ0ZXhBaUdyYXBoU3RvcmVDb25maWcSbQoQZ3Jhc - Ghfc3RvcmVfcG9vbBgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZXNvdXJjZUNvbmZpZ0IT4j8QEg5nc - mFwaFN0b3JlUG9vbFIOZ3JhcGhTdG9yZVBvb2wSYwoMY29tcHV0ZV9wb29sGAIgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2Jtb - C5WZXJ0ZXhBaVJlc291cmNlQ29uZmlnQhDiPw0SC2NvbXB1dGVQb29sUgtjb21wdXRlUG9vbBJpCiBjb21wdXRlX2NsdXN0ZXJfb - G9jYWxfd29ybGRfc2l6ZRgDIAEoBUIh4j8eEhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTaXplUhxjb21wdXRlQ2x1c3RlckxvY - 2FsV29ybGRTaXplIp0DChhEaXN0cmlidXRlZFRyYWluZXJDb25maWcShAEKGHZlcnRleF9haV90cmFpbmVyX2NvbmZpZxgBIAEoC - zItLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlUcmFpbmVyQ29uZmlnQhriPxcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ - 0gAUhV2ZXJ0ZXhBaVRyYWluZXJDb25maWcSbwoSa2ZwX3RyYWluZXJfY29uZmlnGAIgASgLMiguc25hcGNoYXQucmVzZWFyY2guZ - 2JtbC5LRlBUcmFpbmVyQ29uZmlnQhXiPxISEGtmcFRyYWluZXJDb25maWdIAFIQa2ZwVHJhaW5lckNvbmZpZxJ3ChRsb2NhbF90c - mFpbmVyX2NvbmZpZxgDIAEoCzIqLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxUcmFpbmVyQ29uZmlnQhfiPxQSEmxvY2FsV - HJhaW5lckNvbmZpZ0gAUhJsb2NhbFRyYWluZXJDb25maWdCEAoOdHJhaW5lcl9jb25maWcixwQKFVRyYWluZXJSZXNvdXJjZUNvb - 
mZpZxKFAQoYdmVydGV4X2FpX3RyYWluZXJfY29uZmlnGAEgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaVJlc - 291cmNlQ29uZmlnQhriPxcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyYWluZXJDb25maWcScAoSa2ZwX3RyY - WluZXJfY29uZmlnGAIgASgLMikuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBSZXNvdXJjZUNvbmZpZ0IV4j8SEhBrZnBUcmFpb - mVyQ29uZmlnSABSEGtmcFRyYWluZXJDb25maWcSeAoUbG9jYWxfdHJhaW5lcl9jb25maWcYAyABKAsyKy5zbmFwY2hhdC5yZXNlY - XJjaC5nYm1sLkxvY2FsUmVzb3VyY2VDb25maWdCF+I/FBISbG9jYWxUcmFpbmVyQ29uZmlnSABSEmxvY2FsVHJhaW5lckNvbmZpZ - xKnAQokdmVydGV4X2FpX2dyYXBoX3N0b3JlX3RyYWluZXJfY29uZmlnGAQgASgLMjAuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZ - XJ0ZXhBaUdyYXBoU3RvcmVDb25maWdCJOI/IRIfdmVydGV4QWlHcmFwaFN0b3JlVHJhaW5lckNvbmZpZ0gAUh92ZXJ0ZXhBaUdyY - XBoU3RvcmVUcmFpbmVyQ29uZmlnQhAKDnRyYWluZXJfY29uZmlnIocFChhJbmZlcmVuY2VyUmVzb3VyY2VDb25maWcSjgEKG3Zlc - nRleF9haV9pbmZlcmVuY2VyX2NvbmZpZxgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZXNvdXJjZUNvb - mZpZ0Id4j8aEhh2ZXJ0ZXhBaUluZmVyZW5jZXJDb25maWdIAFIYdmVydGV4QWlJbmZlcmVuY2VyQ29uZmlnEo0BChpkYXRhZmxvd - 19pbmZlcmVuY2VyX2NvbmZpZxgCIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuRGF0YWZsb3dSZXNvdXJjZUNvbmZpZ0Id4 - j8aEhhkYXRhZmxvd0luZmVyZW5jZXJDb25maWdIAFIYZGF0YWZsb3dJbmZlcmVuY2VyQ29uZmlnEoEBChdsb2NhbF9pbmZlcmVuY - 2VyX2NvbmZpZxgDIAEoCzIrLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxSZXNvdXJjZUNvbmZpZ0Ia4j8XEhVsb2NhbEluZ - mVyZW5jZXJDb25maWdIAFIVbG9jYWxJbmZlcmVuY2VyQ29uZmlnErABCid2ZXJ0ZXhfYWlfZ3JhcGhfc3RvcmVfaW5mZXJlbmNlc - l9jb25maWcYBCABKAsyMC5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpR3JhcGhTdG9yZUNvbmZpZ0In4j8kEiJ2ZXJ0Z - XhBaUdyYXBoU3RvcmVJbmZlcmVuY2VyQ29uZmlnSABSInZlcnRleEFpR3JhcGhTdG9yZUluZmVyZW5jZXJDb25maWdCEwoRaW5mZ - XJlbmNlcl9jb25maWcilwgKFFNoYXJlZFJlc291cmNlQ29uZmlnEn4KD3Jlc291cmNlX2xhYmVscxgBIAMoCzJALnNuYXBjaGF0L - nJlc2VhcmNoLmdibWwuU2hhcmVkUmVzb3VyY2VDb25maWcuUmVzb3VyY2VMYWJlbHNFbnRyeUIT4j8QEg5yZXNvdXJjZUxhYmVsc - 1IOcmVzb3VyY2VMYWJlbHMSjgEKFWNvbW1vbl9jb21wdXRlX2NvbmZpZxgCIAEoCzJALnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuU - 
2hhcmVkUmVzb3VyY2VDb25maWcuQ29tbW9uQ29tcHV0ZUNvbmZpZ0IY4j8VEhNjb21tb25Db21wdXRlQ29uZmlnUhNjb21tb25Db - 21wdXRlQ29uZmlnGpQFChNDb21tb25Db21wdXRlQ29uZmlnEiYKB3Byb2plY3QYASABKAlCDOI/CRIHcHJvamVjdFIHcHJvamVjd - BIjCgZyZWdpb24YAiABKAlCC+I/CBIGcmVnaW9uUgZyZWdpb24SQwoSdGVtcF9hc3NldHNfYnVja2V0GAMgASgJQhXiPxISEHRlb - XBBc3NldHNCdWNrZXRSEHRlbXBBc3NldHNCdWNrZXQSXAobdGVtcF9yZWdpb25hbF9hc3NldHNfYnVja2V0GAQgASgJQh3iPxoSG - HRlbXBSZWdpb25hbEFzc2V0c0J1Y2tldFIYdGVtcFJlZ2lvbmFsQXNzZXRzQnVja2V0EkMKEnBlcm1fYXNzZXRzX2J1Y2tldBgFI - AEoCUIV4j8SEhBwZXJtQXNzZXRzQnVja2V0UhBwZXJtQXNzZXRzQnVja2V0EloKG3RlbXBfYXNzZXRzX2JxX2RhdGFzZXRfbmFtZ - RgGIAEoCUIc4j8ZEhd0ZW1wQXNzZXRzQnFEYXRhc2V0TmFtZVIXdGVtcEFzc2V0c0JxRGF0YXNldE5hbWUSVgoZZW1iZWRkaW5nX - 2JxX2RhdGFzZXRfbmFtZRgHIAEoCUIb4j8YEhZlbWJlZGRpbmdCcURhdGFzZXROYW1lUhZlbWJlZGRpbmdCcURhdGFzZXROYW1lE - lYKGWdjcF9zZXJ2aWNlX2FjY291bnRfZW1haWwYCCABKAlCG+I/GBIWZ2NwU2VydmljZUFjY291bnRFbWFpbFIWZ2NwU2VydmljZ - UFjY291bnRFbWFpbBI8Cg9kYXRhZmxvd19ydW5uZXIYCyABKAlCE+I/EBIOZGF0YWZsb3dSdW5uZXJSDmRhdGFmbG93UnVubmVyG - lcKE1Jlc291cmNlTGFiZWxzRW50cnkSGgoDa2V5GAEgASgJQgjiPwUSA2tleVIDa2V5EiAKBXZhbHVlGAIgASgJQgriPwcSBXZhb - HVlUgV2YWx1ZToCOAEi9wgKEkdpZ2xSZXNvdXJjZUNvbmZpZxJbChpzaGFyZWRfcmVzb3VyY2VfY29uZmlnX3VyaRgBIAEoCUIc4 - j8ZEhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1VyaUgAUhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1VyaRJ/ChZzaGFyZWRfcmVzb3VyY2VfY - 29uZmlnGAIgASgLMiwuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5TaGFyZWRSZXNvdXJjZUNvbmZpZ0IZ4j8WEhRzaGFyZWRSZXNvd - XJjZUNvbmZpZ0gAUhRzaGFyZWRSZXNvdXJjZUNvbmZpZxJ4ChNwcmVwcm9jZXNzb3JfY29uZmlnGAwgASgLMi4uc25hcGNoYXQuc - mVzZWFyY2guZ2JtbC5EYXRhUHJlcHJvY2Vzc29yQ29uZmlnQhfiPxQSEnByZXByb2Nlc3NvckNvbmZpZ1IScHJlcHJvY2Vzc29yQ - 29uZmlnEn8KF3N1YmdyYXBoX3NhbXBsZXJfY29uZmlnGA0gASgLMisuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5TcGFya1Jlc291c - mNlQ29uZmlnQhriPxcSFXN1YmdyYXBoU2FtcGxlckNvbmZpZ1IVc3ViZ3JhcGhTYW1wbGVyQ29uZmlnEnwKFnNwbGl0X2dlbmVyY - XRvcl9jb25maWcYDiABKAsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlNwYXJrUmVzb3VyY2VDb25maWdCGeI/FhIUc3BsaXRHZ - 
W5lcmF0b3JDb25maWdSFHNwbGl0R2VuZXJhdG9yQ29uZmlnEm0KDnRyYWluZXJfY29uZmlnGA8gASgLMjAuc25hcGNoYXQucmVzZ - WFyY2guZ2JtbC5EaXN0cmlidXRlZFRyYWluZXJDb25maWdCFBgB4j8PEg10cmFpbmVyQ29uZmlnUg10cmFpbmVyQ29uZmlnEnQKE - WluZmVyZW5jZXJfY29uZmlnGBAgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EYXRhZmxvd1Jlc291cmNlQ29uZmlnQhcYA - eI/EhIQaW5mZXJlbmNlckNvbmZpZ1IQaW5mZXJlbmNlckNvbmZpZxKBAQoXdHJhaW5lcl9yZXNvdXJjZV9jb25maWcYESABKAsyL - S5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlRyYWluZXJSZXNvdXJjZUNvbmZpZ0Ia4j8XEhV0cmFpbmVyUmVzb3VyY2VDb25maWdSF - XRyYWluZXJSZXNvdXJjZUNvbmZpZxKNAQoaaW5mZXJlbmNlcl9yZXNvdXJjZV9jb25maWcYEiABKAsyMC5zbmFwY2hhdC5yZXNlY - XJjaC5nYm1sLkluZmVyZW5jZXJSZXNvdXJjZUNvbmZpZ0Id4j8aEhhpbmZlcmVuY2VyUmVzb3VyY2VDb25maWdSGGluZmVyZW5jZ - XJSZXNvdXJjZUNvbmZpZ0IRCg9zaGFyZWRfcmVzb3VyY2Uq4wMKCUNvbXBvbmVudBItChFDb21wb25lbnRfVW5rbm93bhAAGhbiP - xMSEUNvbXBvbmVudF9Vbmtub3duEj8KGkNvbXBvbmVudF9Db25maWdfVmFsaWRhdG9yEAEaH+I/HBIaQ29tcG9uZW50X0NvbmZpZ - 19WYWxpZGF0b3ISPwoaQ29tcG9uZW50X0NvbmZpZ19Qb3B1bGF0b3IQAhof4j8cEhpDb21wb25lbnRfQ29uZmlnX1BvcHVsYXRvc - hJBChtDb21wb25lbnRfRGF0YV9QcmVwcm9jZXNzb3IQAxog4j8dEhtDb21wb25lbnRfRGF0YV9QcmVwcm9jZXNzb3ISPwoaQ29tc - G9uZW50X1N1YmdyYXBoX1NhbXBsZXIQBBof4j8cEhpDb21wb25lbnRfU3ViZ3JhcGhfU2FtcGxlchI9ChlDb21wb25lbnRfU3Bsa - XRfR2VuZXJhdG9yEAUaHuI/GxIZQ29tcG9uZW50X1NwbGl0X0dlbmVyYXRvchItChFDb21wb25lbnRfVHJhaW5lchAGGhbiPxMSE - UNvbXBvbmVudF9UcmFpbmVyEjMKFENvbXBvbmVudF9JbmZlcmVuY2VyEAcaGeI/FhIUQ29tcG9uZW50X0luZmVyZW5jZXJiBnByb - 3RvMw==""" + 291cmNlX25hbWUYCiABKAlCHOI/GRIXdGVuc29yYm9hcmRSZXNvdXJjZU5hbWVSF3RlbnNvcmJvYXJkUmVzb3VyY2VOYW1lEl4KG + 3RlbnNvcmJvYXJkX2V4cGVyaW1lbnRfbmFtZRgLIAEoCUIe4j8bEhl0ZW5zb3Jib2FyZEV4cGVyaW1lbnROYW1lUhl0ZW5zb3Jib + 2FyZEV4cGVyaW1lbnROYW1lIooCChFLRlBSZXNvdXJjZUNvbmZpZxIwCgtjcHVfcmVxdWVzdBgBIAEoCUIP4j8MEgpjcHVSZXF1Z + XN0UgpjcHVSZXF1ZXN0EjkKDm1lbW9yeV9yZXF1ZXN0GAIgASgJQhLiPw8SDW1lbW9yeVJlcXVlc3RSDW1lbW9yeVJlcXVlc3QSJ + woIZ3B1X3R5cGUYAyABKAlCDOI/CRIHZ3B1VHlwZVIHZ3B1VHlwZRIqCglncHVfbGltaXQYBCABKA1CDeI/ChIIZ3B1TGltaXRSC + 
GdwdUxpbWl0EjMKDG51bV9yZXBsaWNhcxgFIAEoDUIQ4j8NEgtudW1SZXBsaWNhc1ILbnVtUmVwbGljYXMiRwoTTG9jYWxSZXNvd + XJjZUNvbmZpZxIwCgtudW1fd29ya2VycxgBIAEoDUIP4j8MEgpudW1Xb3JrZXJzUgpudW1Xb3JrZXJzItkCChhWZXJ0ZXhBaUdyY + XBoU3RvcmVDb25maWcSbQoQZ3JhcGhfc3RvcmVfcG9vbBgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZ + XNvdXJjZUNvbmZpZ0IT4j8QEg5ncmFwaFN0b3JlUG9vbFIOZ3JhcGhTdG9yZVBvb2wSYwoMY29tcHV0ZV9wb29sGAIgASgLMi4uc + 25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaVJlc291cmNlQ29uZmlnQhDiPw0SC2NvbXB1dGVQb29sUgtjb21wdXRlUG9vb + BJpCiBjb21wdXRlX2NsdXN0ZXJfbG9jYWxfd29ybGRfc2l6ZRgDIAEoBUIh4j8eEhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTa + XplUhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTaXplIp0DChhEaXN0cmlidXRlZFRyYWluZXJDb25maWcShAEKGHZlcnRleF9ha + V90cmFpbmVyX2NvbmZpZxgBIAEoCzItLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlUcmFpbmVyQ29uZmlnQhriPxcSF + XZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyYWluZXJDb25maWcSbwoSa2ZwX3RyYWluZXJfY29uZmlnGAIgASgLM + iguc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBUcmFpbmVyQ29uZmlnQhXiPxISEGtmcFRyYWluZXJDb25maWdIAFIQa2ZwVHJha + W5lckNvbmZpZxJ3ChRsb2NhbF90cmFpbmVyX2NvbmZpZxgDIAEoCzIqLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxUcmFpb + mVyQ29uZmlnQhfiPxQSEmxvY2FsVHJhaW5lckNvbmZpZ0gAUhJsb2NhbFRyYWluZXJDb25maWdCEAoOdHJhaW5lcl9jb25maWcix + wQKFVRyYWluZXJSZXNvdXJjZUNvbmZpZxKFAQoYdmVydGV4X2FpX3RyYWluZXJfY29uZmlnGAEgASgLMi4uc25hcGNoYXQucmVzZ + WFyY2guZ2JtbC5WZXJ0ZXhBaVJlc291cmNlQ29uZmlnQhriPxcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyY + WluZXJDb25maWcScAoSa2ZwX3RyYWluZXJfY29uZmlnGAIgASgLMikuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBSZXNvdXJjZ + UNvbmZpZ0IV4j8SEhBrZnBUcmFpbmVyQ29uZmlnSABSEGtmcFRyYWluZXJDb25maWcSeAoUbG9jYWxfdHJhaW5lcl9jb25maWcYA + yABKAsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkxvY2FsUmVzb3VyY2VDb25maWdCF+I/FBISbG9jYWxUcmFpbmVyQ29uZmlnS + ABSEmxvY2FsVHJhaW5lckNvbmZpZxKnAQokdmVydGV4X2FpX2dyYXBoX3N0b3JlX3RyYWluZXJfY29uZmlnGAQgASgLMjAuc25hc + GNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaUdyYXBoU3RvcmVDb25maWdCJOI/IRIfdmVydGV4QWlHcmFwaFN0b3JlVHJhaW5lc + 
kNvbmZpZ0gAUh92ZXJ0ZXhBaUdyYXBoU3RvcmVUcmFpbmVyQ29uZmlnQhAKDnRyYWluZXJfY29uZmlnIocFChhJbmZlcmVuY2VyU + mVzb3VyY2VDb25maWcSjgEKG3ZlcnRleF9haV9pbmZlcmVuY2VyX2NvbmZpZxgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdib + WwuVmVydGV4QWlSZXNvdXJjZUNvbmZpZ0Id4j8aEhh2ZXJ0ZXhBaUluZmVyZW5jZXJDb25maWdIAFIYdmVydGV4QWlJbmZlcmVuY + 2VyQ29uZmlnEo0BChpkYXRhZmxvd19pbmZlcmVuY2VyX2NvbmZpZxgCIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuRGF0Y + WZsb3dSZXNvdXJjZUNvbmZpZ0Id4j8aEhhkYXRhZmxvd0luZmVyZW5jZXJDb25maWdIAFIYZGF0YWZsb3dJbmZlcmVuY2VyQ29uZ + mlnEoEBChdsb2NhbF9pbmZlcmVuY2VyX2NvbmZpZxgDIAEoCzIrLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxSZXNvdXJjZ + UNvbmZpZ0Ia4j8XEhVsb2NhbEluZmVyZW5jZXJDb25maWdIAFIVbG9jYWxJbmZlcmVuY2VyQ29uZmlnErABCid2ZXJ0ZXhfYWlfZ + 3JhcGhfc3RvcmVfaW5mZXJlbmNlcl9jb25maWcYBCABKAsyMC5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpR3JhcGhTd + G9yZUNvbmZpZ0In4j8kEiJ2ZXJ0ZXhBaUdyYXBoU3RvcmVJbmZlcmVuY2VyQ29uZmlnSABSInZlcnRleEFpR3JhcGhTdG9yZUluZ + mVyZW5jZXJDb25maWdCEwoRaW5mZXJlbmNlcl9jb25maWcilwgKFFNoYXJlZFJlc291cmNlQ29uZmlnEn4KD3Jlc291cmNlX2xhY + mVscxgBIAMoCzJALnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuU2hhcmVkUmVzb3VyY2VDb25maWcuUmVzb3VyY2VMYWJlbHNFbnRye + UIT4j8QEg5yZXNvdXJjZUxhYmVsc1IOcmVzb3VyY2VMYWJlbHMSjgEKFWNvbW1vbl9jb21wdXRlX2NvbmZpZxgCIAEoCzJALnNuY + XBjaGF0LnJlc2VhcmNoLmdibWwuU2hhcmVkUmVzb3VyY2VDb25maWcuQ29tbW9uQ29tcHV0ZUNvbmZpZ0IY4j8VEhNjb21tb25Db + 21wdXRlQ29uZmlnUhNjb21tb25Db21wdXRlQ29uZmlnGpQFChNDb21tb25Db21wdXRlQ29uZmlnEiYKB3Byb2plY3QYASABKAlCD + OI/CRIHcHJvamVjdFIHcHJvamVjdBIjCgZyZWdpb24YAiABKAlCC+I/CBIGcmVnaW9uUgZyZWdpb24SQwoSdGVtcF9hc3NldHNfY + nVja2V0GAMgASgJQhXiPxISEHRlbXBBc3NldHNCdWNrZXRSEHRlbXBBc3NldHNCdWNrZXQSXAobdGVtcF9yZWdpb25hbF9hc3Nld + HNfYnVja2V0GAQgASgJQh3iPxoSGHRlbXBSZWdpb25hbEFzc2V0c0J1Y2tldFIYdGVtcFJlZ2lvbmFsQXNzZXRzQnVja2V0EkMKE + nBlcm1fYXNzZXRzX2J1Y2tldBgFIAEoCUIV4j8SEhBwZXJtQXNzZXRzQnVja2V0UhBwZXJtQXNzZXRzQnVja2V0EloKG3RlbXBfY + XNzZXRzX2JxX2RhdGFzZXRfbmFtZRgGIAEoCUIc4j8ZEhd0ZW1wQXNzZXRzQnFEYXRhc2V0TmFtZVIXdGVtcEFzc2V0c0JxRGF0Y + 
XNldE5hbWUSVgoZZW1iZWRkaW5nX2JxX2RhdGFzZXRfbmFtZRgHIAEoCUIb4j8YEhZlbWJlZGRpbmdCcURhdGFzZXROYW1lUhZlb + WJlZGRpbmdCcURhdGFzZXROYW1lElYKGWdjcF9zZXJ2aWNlX2FjY291bnRfZW1haWwYCCABKAlCG+I/GBIWZ2NwU2VydmljZUFjY + 291bnRFbWFpbFIWZ2NwU2VydmljZUFjY291bnRFbWFpbBI8Cg9kYXRhZmxvd19ydW5uZXIYCyABKAlCE+I/EBIOZGF0YWZsb3dSd + W5uZXJSDmRhdGFmbG93UnVubmVyGlcKE1Jlc291cmNlTGFiZWxzRW50cnkSGgoDa2V5GAEgASgJQgjiPwUSA2tleVIDa2V5EiAKB + XZhbHVlGAIgASgJQgriPwcSBXZhbHVlUgV2YWx1ZToCOAEi9wgKEkdpZ2xSZXNvdXJjZUNvbmZpZxJbChpzaGFyZWRfcmVzb3VyY + 2VfY29uZmlnX3VyaRgBIAEoCUIc4j8ZEhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1VyaUgAUhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1Vya + RJ/ChZzaGFyZWRfcmVzb3VyY2VfY29uZmlnGAIgASgLMiwuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5TaGFyZWRSZXNvdXJjZUNvb + mZpZ0IZ4j8WEhRzaGFyZWRSZXNvdXJjZUNvbmZpZ0gAUhRzaGFyZWRSZXNvdXJjZUNvbmZpZxJ4ChNwcmVwcm9jZXNzb3JfY29uZ + mlnGAwgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EYXRhUHJlcHJvY2Vzc29yQ29uZmlnQhfiPxQSEnByZXByb2Nlc3Nvc + kNvbmZpZ1IScHJlcHJvY2Vzc29yQ29uZmlnEn8KF3N1YmdyYXBoX3NhbXBsZXJfY29uZmlnGA0gASgLMisuc25hcGNoYXQucmVzZ + WFyY2guZ2JtbC5TcGFya1Jlc291cmNlQ29uZmlnQhriPxcSFXN1YmdyYXBoU2FtcGxlckNvbmZpZ1IVc3ViZ3JhcGhTYW1wbGVyQ + 29uZmlnEnwKFnNwbGl0X2dlbmVyYXRvcl9jb25maWcYDiABKAsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlNwYXJrUmVzb3VyY + 2VDb25maWdCGeI/FhIUc3BsaXRHZW5lcmF0b3JDb25maWdSFHNwbGl0R2VuZXJhdG9yQ29uZmlnEm0KDnRyYWluZXJfY29uZmlnG + A8gASgLMjAuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EaXN0cmlidXRlZFRyYWluZXJDb25maWdCFBgB4j8PEg10cmFpbmVyQ29uZ + mlnUg10cmFpbmVyQ29uZmlnEnQKEWluZmVyZW5jZXJfY29uZmlnGBAgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EYXRhZ + mxvd1Jlc291cmNlQ29uZmlnQhcYAeI/EhIQaW5mZXJlbmNlckNvbmZpZ1IQaW5mZXJlbmNlckNvbmZpZxKBAQoXdHJhaW5lcl9yZ + XNvdXJjZV9jb25maWcYESABKAsyLS5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlRyYWluZXJSZXNvdXJjZUNvbmZpZ0Ia4j8XEhV0c + mFpbmVyUmVzb3VyY2VDb25maWdSFXRyYWluZXJSZXNvdXJjZUNvbmZpZxKNAQoaaW5mZXJlbmNlcl9yZXNvdXJjZV9jb25maWcYE + iABKAsyMC5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkluZmVyZW5jZXJSZXNvdXJjZUNvbmZpZ0Id4j8aEhhpbmZlcmVuY2VyUmVzb + 
3VyY2VDb25maWdSGGluZmVyZW5jZXJSZXNvdXJjZUNvbmZpZ0IRCg9zaGFyZWRfcmVzb3VyY2Uq4wMKCUNvbXBvbmVudBItChFDb + 21wb25lbnRfVW5rbm93bhAAGhbiPxMSEUNvbXBvbmVudF9Vbmtub3duEj8KGkNvbXBvbmVudF9Db25maWdfVmFsaWRhdG9yEAEaH + +I/HBIaQ29tcG9uZW50X0NvbmZpZ19WYWxpZGF0b3ISPwoaQ29tcG9uZW50X0NvbmZpZ19Qb3B1bGF0b3IQAhof4j8cEhpDb21wb + 25lbnRfQ29uZmlnX1BvcHVsYXRvchJBChtDb21wb25lbnRfRGF0YV9QcmVwcm9jZXNzb3IQAxog4j8dEhtDb21wb25lbnRfRGF0Y + V9QcmVwcm9jZXNzb3ISPwoaQ29tcG9uZW50X1N1YmdyYXBoX1NhbXBsZXIQBBof4j8cEhpDb21wb25lbnRfU3ViZ3JhcGhfU2Ftc + GxlchI9ChlDb21wb25lbnRfU3BsaXRfR2VuZXJhdG9yEAUaHuI/GxIZQ29tcG9uZW50X1NwbGl0X0dlbmVyYXRvchItChFDb21wb + 25lbnRfVHJhaW5lchAGGhbiPxMSEUNvbXBvbmVudF9UcmFpbmVyEjMKFENvbXBvbmVudF9JbmZlcmVuY2VyEAcaGeI/FhIUQ29tc + G9uZW50X0luZmVyZW5jZXJiBnByb3RvMw==""" ).mkString) lazy val scalaDescriptor: _root_.scalapb.descriptors.FileDescriptor = { val scalaProto = com.google.protobuf.descriptor.FileDescriptorProto.parseFrom(ProtoBytes) diff --git a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala index 8a29093bb..d863014af 100644 --- a/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala +++ b/scala/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala @@ -41,6 +41,15 @@ package snapchat.research.gbml.gigl_resource_config * Format: projects/{project}/locations/{region}/tensorboards/{tensorboard_id} * See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview * for the Tensorboard data model. + * @param tensorboardExperimentName + * Optional. When set, the trainer's chief rank streams events to a + * TensorboardExperiment with this name on the TB resource above, in + * addition to Vertex's per-job auto-upload. 
Multiple jobs that share this + * value land in the same TensorboardExperiment, so they appear as + * comparable runs on one TensorBoard page. Requires + * tensorboard_resource_name above to be set. Allowed characters: + * lowercase letters, digits, hyphens (Vertex AI Experiment ID rules). + * See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview. */ @SerialVersionUID(0L) final case class VertexAiResourceConfig( @@ -54,6 +63,7 @@ final case class VertexAiResourceConfig( bootDiskSizeGb: _root_.scala.Int = 0, reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = _root_.scala.None, tensorboardResourceName: _root_.scala.Predef.String = "", + tensorboardExperimentName: _root_.scala.Predef.String = "", unknownFields: _root_.scalapb.UnknownFieldSet = _root_.scalapb.UnknownFieldSet.empty ) extends scalapb.GeneratedMessage with scalapb.lenses.Updatable[VertexAiResourceConfig] { @transient @@ -127,6 +137,13 @@ final case class VertexAiResourceConfig( __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(10, __value) } }; + + { + val __value = tensorboardExperimentName + if (!__value.isEmpty) { + __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(11, __value) + } + }; __size += unknownFields.serializedSize __size } @@ -200,6 +217,12 @@ final case class VertexAiResourceConfig( _output__.writeString(10, __v) } }; + { + val __v = tensorboardExperimentName + if (!__v.isEmpty) { + _output__.writeString(11, __v) + } + }; unknownFields.writeTo(_output__) } def withMachineType(__v: _root_.scala.Predef.String): VertexAiResourceConfig = copy(machineType = __v) @@ -214,6 +237,7 @@ final case class VertexAiResourceConfig( def clearReservationAffinity: VertexAiResourceConfig = copy(reservationAffinity = _root_.scala.None) def withReservationAffinity(__v: snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity): VertexAiResourceConfig = 
copy(reservationAffinity = Option(__v)) def withTensorboardResourceName(__v: _root_.scala.Predef.String): VertexAiResourceConfig = copy(tensorboardResourceName = __v) + def withTensorboardExperimentName(__v: _root_.scala.Predef.String): VertexAiResourceConfig = copy(tensorboardExperimentName = __v) def withUnknownFields(__v: _root_.scalapb.UnknownFieldSet) = copy(unknownFields = __v) def discardUnknownFields = copy(unknownFields = _root_.scalapb.UnknownFieldSet.empty) def getFieldByNumber(__fieldNumber: _root_.scala.Int): _root_.scala.Any = { @@ -255,6 +279,10 @@ final case class VertexAiResourceConfig( val __t = tensorboardResourceName if (__t != "") __t else null } + case 11 => { + val __t = tensorboardExperimentName + if (__t != "") __t else null + } } } def getField(__field: _root_.scalapb.descriptors.FieldDescriptor): _root_.scalapb.descriptors.PValue = { @@ -270,6 +298,7 @@ final case class VertexAiResourceConfig( case 8 => _root_.scalapb.descriptors.PInt(bootDiskSizeGb) case 9 => reservationAffinity.map(_.toPMessage).getOrElse(_root_.scalapb.descriptors.PEmpty) case 10 => _root_.scalapb.descriptors.PString(tensorboardResourceName) + case 11 => _root_.scalapb.descriptors.PString(tensorboardExperimentName) } } def toProtoString: _root_.scala.Predef.String = _root_.scalapb.TextFormat.printToUnicodeString(this) @@ -290,6 +319,7 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat var __bootDiskSizeGb: _root_.scala.Int = 0 var __reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = _root_.scala.None var __tensorboardResourceName: _root_.scala.Predef.String = "" + var __tensorboardExperimentName: _root_.scala.Predef.String = "" var `_unknownFields__`: _root_.scalapb.UnknownFieldSet.Builder = null var _done__ = false while (!_done__) { @@ -316,6 +346,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat __reservationAffinity = 
Option(__reservationAffinity.fold(_root_.scalapb.LiteParser.readMessage[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity](_input__))(_root_.scalapb.LiteParser.readMessage(_input__, _))) case 82 => __tensorboardResourceName = _input__.readStringRequireUtf8() + case 90 => + __tensorboardExperimentName = _input__.readStringRequireUtf8() case tag => if (_unknownFields__ == null) { _unknownFields__ = new _root_.scalapb.UnknownFieldSet.Builder() @@ -334,6 +366,7 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat bootDiskSizeGb = __bootDiskSizeGb, reservationAffinity = __reservationAffinity, tensorboardResourceName = __tensorboardResourceName, + tensorboardExperimentName = __tensorboardExperimentName, unknownFields = if (_unknownFields__ == null) _root_.scalapb.UnknownFieldSet.empty else _unknownFields__.result() ) } @@ -350,7 +383,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat schedulingStrategy = __fieldsMap.get(scalaDescriptor.findFieldByNumber(7).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), bootDiskSizeGb = __fieldsMap.get(scalaDescriptor.findFieldByNumber(8).get).map(_.as[_root_.scala.Int]).getOrElse(0), reservationAffinity = __fieldsMap.get(scalaDescriptor.findFieldByNumber(9).get).flatMap(_.as[_root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity]]), - tensorboardResourceName = __fieldsMap.get(scalaDescriptor.findFieldByNumber(10).get).map(_.as[_root_.scala.Predef.String]).getOrElse("") + tensorboardResourceName = __fieldsMap.get(scalaDescriptor.findFieldByNumber(10).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), + tensorboardExperimentName = __fieldsMap.get(scalaDescriptor.findFieldByNumber(11).get).map(_.as[_root_.scala.Predef.String]).getOrElse("") ) case _ => throw new RuntimeException("Expected PMessage") } @@ -375,7 +409,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat 
schedulingStrategy = "", bootDiskSizeGb = 0, reservationAffinity = _root_.scala.None, - tensorboardResourceName = "" + tensorboardResourceName = "", + tensorboardExperimentName = "" ) implicit class VertexAiResourceConfigLens[UpperPB](_l: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig]) extends _root_.scalapb.lenses.ObjectLens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig](_l) { def machineType: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.machineType)((c_, f_) => c_.copy(machineType = f_)) @@ -389,6 +424,7 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat def reservationAffinity: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = field(_.getReservationAffinity)((c_, f_) => c_.copy(reservationAffinity = Option(f_))) def optionalReservationAffinity: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity]] = field(_.reservationAffinity)((c_, f_) => c_.copy(reservationAffinity = f_)) def tensorboardResourceName: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.tensorboardResourceName)((c_, f_) => c_.copy(tensorboardResourceName = f_)) + def tensorboardExperimentName: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.tensorboardExperimentName)((c_, f_) => c_.copy(tensorboardExperimentName = f_)) } final val MACHINE_TYPE_FIELD_NUMBER = 1 final val GPU_TYPE_FIELD_NUMBER = 2 @@ -400,6 +436,7 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat final val BOOT_DISK_SIZE_GB_FIELD_NUMBER = 8 final val RESERVATION_AFFINITY_FIELD_NUMBER = 9 final val TENSORBOARD_RESOURCE_NAME_FIELD_NUMBER = 10 + final val TENSORBOARD_EXPERIMENT_NAME_FIELD_NUMBER = 11 def of( machineType: _root_.scala.Predef.String, gpuType: 
_root_.scala.Predef.String, @@ -410,7 +447,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat schedulingStrategy: _root_.scala.Predef.String, bootDiskSizeGb: _root_.scala.Int, reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity], - tensorboardResourceName: _root_.scala.Predef.String + tensorboardResourceName: _root_.scala.Predef.String, + tensorboardExperimentName: _root_.scala.Predef.String ): _root_.snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig = _root_.snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig( machineType, gpuType, @@ -421,7 +459,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat schedulingStrategy, bootDiskSizeGb, reservationAffinity, - tensorboardResourceName + tensorboardResourceName, + tensorboardExperimentName ) // @@protoc_insertion_point(GeneratedMessageCompanion[snapchat.research.gbml.VertexAiResourceConfig]) } diff --git a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala index 658cd15ae..63c31ede6 100644 --- a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala +++ b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfig.scala @@ -3966,16 +3966,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb * Arguments to parameterize training process with. * @param shouldLogToTensorboard * Weather to log to tensorboard or not (defaults to false) - * @param tensorboardExperimentName - * Optional. When set, the trainer's chief rank streams events to a - * TensorboardExperiment with this name on the configured Tensorboard - * resource, in addition to Vertex's built-in per-job auto-upload. 
- * Multiple jobs that share the same value land in the same - * TensorboardExperiment, so they appear as comparable runs on one - * TensorBoard page. Requires - * GiglResourceConfig...tensorboard_resource_name to be set. Allowed - * characters: lowercase letters, digits, hyphens (Vertex AI Experiment - * ID rules). */ @SerialVersionUID(0L) final case class TrainerConfig( @@ -3983,7 +3973,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerArgs: _root_.scala.collection.immutable.Map[_root_.scala.Predef.String, _root_.scala.Predef.String] = _root_.scala.collection.immutable.Map.empty, executable: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty, shouldLogToTensorboard: _root_.scala.Boolean = false, - tensorboardExperimentName: _root_.scala.Predef.String = "", storageConfig: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.Empty, unknownFields: _root_.scalapb.UnknownFieldSet = _root_.scalapb.UnknownFieldSet.empty ) extends scalapb.GeneratedMessage with scalapb.lenses.Updatable[TrainerConfig] { @@ -4017,13 +4006,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb __size += _root_.com.google.protobuf.CodedOutputStream.computeBoolSize(12, __value) } }; - - { - val __value = tensorboardExperimentName - if (!__value.isEmpty) { - __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(14, __value) - } - }; if (storageConfig.graphStoreStorageConfig.isDefined) { val __value = storageConfig.graphStoreStorageConfig.get __size += 1 + _root_.com.google.protobuf.CodedOutputStream.computeUInt32SizeNoTag(__value.serializedSize) + __value.serializedSize @@ -4065,12 +4047,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb 
_output__.writeUInt32NoTag(__m.serializedSize) __m.writeTo(_output__) }; - { - val __v = tensorboardExperimentName - if (!__v.isEmpty) { - _output__.writeString(14, __v) - } - }; executable.clsPath.foreach { __v => val __m = __v _output__.writeString(100, __m) @@ -4091,7 +4067,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb def getCommand: _root_.scala.Predef.String = executable.command.getOrElse("") def withCommand(__v: _root_.scala.Predef.String): TrainerConfig = copy(executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Command(__v)) def withShouldLogToTensorboard(__v: _root_.scala.Boolean): TrainerConfig = copy(shouldLogToTensorboard = __v) - def withTensorboardExperimentName(__v: _root_.scala.Predef.String): TrainerConfig = copy(tensorboardExperimentName = __v) def getGraphStoreStorageConfig: snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig = storageConfig.graphStoreStorageConfig.getOrElse(snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig.defaultInstance) def withGraphStoreStorageConfig(__v: snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig): TrainerConfig = copy(storageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.GraphStoreStorageConfig(__v)) def clearExecutable: TrainerConfig = copy(executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty) @@ -4113,10 +4088,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb val __t = shouldLogToTensorboard if (__t != false) __t else null } - case 14 => { - val __t = tensorboardExperimentName - if (__t != "") __t else null - } case 13 => storageConfig.graphStoreStorageConfig.orNull } } @@ -4128,7 +4099,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb case 100 => 
executable.clsPath.map(_root_.scalapb.descriptors.PString(_)).getOrElse(_root_.scalapb.descriptors.PEmpty) case 101 => executable.command.map(_root_.scalapb.descriptors.PString(_)).getOrElse(_root_.scalapb.descriptors.PEmpty) case 12 => _root_.scalapb.descriptors.PBoolean(shouldLogToTensorboard) - case 14 => _root_.scalapb.descriptors.PString(tensorboardExperimentName) case 13 => storageConfig.graphStoreStorageConfig.map(_.toPMessage).getOrElse(_root_.scalapb.descriptors.PEmpty) } } @@ -4143,7 +4113,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb var __trainerClsPath: _root_.scala.Predef.String = "" val __trainerArgs: _root_.scala.collection.mutable.Builder[(_root_.scala.Predef.String, _root_.scala.Predef.String), _root_.scala.collection.immutable.Map[_root_.scala.Predef.String, _root_.scala.Predef.String]] = _root_.scala.collection.immutable.Map.newBuilder[_root_.scala.Predef.String, _root_.scala.Predef.String] var __shouldLogToTensorboard: _root_.scala.Boolean = false - var __tensorboardExperimentName: _root_.scala.Predef.String = "" var __executable: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty var __storageConfig: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.Empty var `_unknownFields__`: _root_.scalapb.UnknownFieldSet.Builder = null @@ -4162,8 +4131,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb __executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Command(_input__.readStringRequireUtf8()) case 96 => __shouldLogToTensorboard = _input__.readBool() - case 114 => - __tensorboardExperimentName = _input__.readStringRequireUtf8() case 106 => __storageConfig = 
snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.GraphStoreStorageConfig(__storageConfig.graphStoreStorageConfig.fold(_root_.scalapb.LiteParser.readMessage[snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig](_input__))(_root_.scalapb.LiteParser.readMessage(_input__, _))) case tag => @@ -4177,7 +4144,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerClsPath = __trainerClsPath, trainerArgs = __trainerArgs.result(), shouldLogToTensorboard = __shouldLogToTensorboard, - tensorboardExperimentName = __tensorboardExperimentName, executable = __executable, storageConfig = __storageConfig, unknownFields = if (_unknownFields__ == null) _root_.scalapb.UnknownFieldSet.empty else _unknownFields__.result() @@ -4190,7 +4156,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerClsPath = __fieldsMap.get(scalaDescriptor.findFieldByNumber(1).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), trainerArgs = __fieldsMap.get(scalaDescriptor.findFieldByNumber(2).get).map(_.as[_root_.scala.Seq[snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.TrainerArgsEntry]]).getOrElse(_root_.scala.Seq.empty).iterator.map(snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig._typemapper_trainerArgs.toCustom(_)).toMap, shouldLogToTensorboard = __fieldsMap.get(scalaDescriptor.findFieldByNumber(12).get).map(_.as[_root_.scala.Boolean]).getOrElse(false), - tensorboardExperimentName = __fieldsMap.get(scalaDescriptor.findFieldByNumber(14).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), executable = __fieldsMap.get(scalaDescriptor.findFieldByNumber(100).get).flatMap(_.as[_root_.scala.Option[_root_.scala.Predef.String]]).map(snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.ClsPath(_)) 
.orElse[snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable](__fieldsMap.get(scalaDescriptor.findFieldByNumber(101).get).flatMap(_.as[_root_.scala.Option[_root_.scala.Predef.String]]).map(snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Command(_))) .getOrElse(snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty), @@ -4218,7 +4183,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerClsPath = "", trainerArgs = _root_.scala.collection.immutable.Map.empty, shouldLogToTensorboard = false, - tensorboardExperimentName = "", executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Empty, storageConfig = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.Empty ) @@ -4429,7 +4393,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb def clsPath: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.getClsPath)((c_, f_) => c_.copy(executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.ClsPath(f_))) def command: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.getCommand)((c_, f_) => c_.copy(executable = snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable.Command(f_))) def shouldLogToTensorboard: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Boolean] = field(_.shouldLogToTensorboard)((c_, f_) => c_.copy(shouldLogToTensorboard = f_)) - def tensorboardExperimentName: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.tensorboardExperimentName)((c_, f_) => c_.copy(tensorboardExperimentName = f_)) def graphStoreStorageConfig: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gbml_config.GbmlConfig.GraphStoreStorageConfig] = field(_.getGraphStoreStorageConfig)((c_, f_) => c_.copy(storageConfig = 
snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig.GraphStoreStorageConfig(f_))) def executable: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable] = field(_.executable)((c_, f_) => c_.copy(executable = f_)) def storageConfig: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig] = field(_.storageConfig)((c_, f_) => c_.copy(storageConfig = f_)) @@ -4439,7 +4402,6 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb final val CLS_PATH_FIELD_NUMBER = 100 final val COMMAND_FIELD_NUMBER = 101 final val SHOULD_LOG_TO_TENSORBOARD_FIELD_NUMBER = 12 - final val TENSORBOARD_EXPERIMENT_NAME_FIELD_NUMBER = 14 final val GRAPH_STORE_STORAGE_CONFIG_FIELD_NUMBER = 13 @transient private[gbml_config] val _typemapper_trainerArgs: _root_.scalapb.TypeMapper[snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.TrainerArgsEntry, (_root_.scala.Predef.String, _root_.scala.Predef.String)] = implicitly[_root_.scalapb.TypeMapper[snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.TrainerArgsEntry, (_root_.scala.Predef.String, _root_.scala.Predef.String)]] @@ -4448,14 +4410,12 @@ object GbmlConfig extends scalapb.GeneratedMessageCompanion[snapchat.research.gb trainerArgs: _root_.scala.collection.immutable.Map[_root_.scala.Predef.String, _root_.scala.Predef.String], executable: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.Executable, shouldLogToTensorboard: _root_.scala.Boolean, - tensorboardExperimentName: _root_.scala.Predef.String, storageConfig: snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig.StorageConfig ): _root_.snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig = _root_.snapchat.research.gbml.gbml_config.GbmlConfig.TrainerConfig( trainerClsPath, trainerArgs, executable, shouldLogToTensorboard, - tensorboardExperimentName, storageConfig ) // 
@@protoc_insertion_point(GeneratedMessageCompanion[snapchat.research.gbml.GbmlConfig.TrainerConfig]) diff --git a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfigProto.scala b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfigProto.scala index 5aa5dd8a7..a9c35d542 100644 --- a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfigProto.scala +++ b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gbml_config/GbmlConfigProto.scala @@ -26,7 +26,7 @@ object GbmlConfigProto extends _root_.scalapb.GeneratedFileObject { GhfbWV0YWRhdGEucHJvdG8aLXNuYXBjaGF0L3Jlc2VhcmNoL2dibWwvZGF0YXNldF9tZXRhZGF0YS5wcm90bxozc25hcGNoYXQvc mVzZWFyY2gvZ2JtbC90cmFpbmVkX21vZGVsX21ldGFkYXRhLnByb3RvGi9zbmFwY2hhdC9yZXNlYXJjaC9nYm1sL2luZmVyZW5jZ V9tZXRhZGF0YS5wcm90bxozc25hcGNoYXQvcmVzZWFyY2gvZ2JtbC9wb3N0cHJvY2Vzc2VkX21ldGFkYXRhLnByb3RvGjdzbmFwY - 2hhdC9yZXNlYXJjaC9nYm1sL3N1YmdyYXBoX3NhbXBsaW5nX3N0cmF0ZWd5LnByb3RvIqpNCgpHYm1sQ29uZmlnEmcKDXRhc2tfb + 2hhdC9yZXNlYXJjaC9nYm1sL3N1YmdyYXBoX3NhbXBsaW5nX3N0cmF0ZWd5LnByb3RvIspMCgpHYm1sQ29uZmlnEmcKDXRhc2tfb WV0YWRhdGEYASABKAsyLy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkdibWxDb25maWcuVGFza01ldGFkYXRhQhHiPw4SDHRhc2tNZ XRhZGF0YVIMdGFza01ldGFkYXRhEmAKDmdyYXBoX21ldGFkYXRhGAIgASgLMiUuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5HcmFwa E1ldGFkYXRhQhLiPw8SDWdyYXBoTWV0YWRhdGFSDWdyYXBoTWV0YWRhdGESZwoNc2hhcmVkX2NvbmZpZxgDIAEoCzIvLnNuYXBja @@ -125,40 +125,39 @@ object GbmlConfigProto extends _root_.scalapb.GeneratedFileObject { BgBIAEoCUIM4j8JEgdjb21tYW5kUgdjb21tYW5kEoABCgxzdG9yYWdlX2FyZ3MYAiADKAsySy5zbmFwY2hhdC5yZXNlYXJjaC5nY m1sLkdibWxDb25maWcuR3JhcGhTdG9yZVN0b3JhZ2VDb25maWcuU3RvcmFnZUFyZ3NFbnRyeUIQ4j8NEgtzdG9yYWdlQXJnc1ILc 3RvcmFnZUFyZ3MaVAoQU3RvcmFnZUFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCC - uI/BxIFdmFsdWVSBXZhbHVlOgI4ARrjBQoNVHJhaW5lckNvbmZpZxI9ChB0cmFpbmVyX2Nsc19wYXRoGAEgASgJQhPiPxASDnRyY + 
uI/BxIFdmFsdWVSBXZhbHVlOgI4ARqDBQoNVHJhaW5lckNvbmZpZxI9ChB0cmFpbmVyX2Nsc19wYXRoGAEgASgJQhPiPxASDnRyY WluZXJDbHNQYXRoUg50cmFpbmVyQ2xzUGF0aBJ2Cgx0cmFpbmVyX2FyZ3MYAiADKAsyQS5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sL kdibWxDb25maWcuVHJhaW5lckNvbmZpZy5UcmFpbmVyQXJnc0VudHJ5QhDiPw0SC3RyYWluZXJBcmdzUgt0cmFpbmVyQXJncxIpC ghjbHNfcGF0aBhkIAEoCUIM4j8JEgdjbHNQYXRoSABSB2Nsc1BhdGgSKAoHY29tbWFuZBhlIAEoCUIM4j8JEgdjb21tYW5kSABSB 2NvbW1hbmQSVgoZc2hvdWxkX2xvZ190b190ZW5zb3Jib2FyZBgMIAEoCEIb4j8YEhZzaG91bGRMb2dUb1RlbnNvcmJvYXJkUhZza - G91bGRMb2dUb1RlbnNvcmJvYXJkEl4KG3RlbnNvcmJvYXJkX2V4cGVyaW1lbnRfbmFtZRgOIAEoCUIe4j8bEhl0ZW5zb3Jib2FyZ - EV4cGVyaW1lbnROYW1lUhl0ZW5zb3Jib2FyZEV4cGVyaW1lbnROYW1lEpcBChpncmFwaF9zdG9yZV9zdG9yYWdlX2NvbmZpZxgNI - AEoCzI6LnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuR2JtbENvbmZpZy5HcmFwaFN0b3JlU3RvcmFnZUNvbmZpZ0Ic4j8ZEhdncmFwa - FN0b3JlU3RvcmFnZUNvbmZpZ0gBUhdncmFwaFN0b3JlU3RvcmFnZUNvbmZpZxpUChBUcmFpbmVyQXJnc0VudHJ5EhoKA2tleRgBI - AEoCUII4j8FEgNrZXlSA2tleRIgCgV2YWx1ZRgCIAEoCUIK4j8HEgV2YWx1ZVIFdmFsdWU6AjgBQgwKCmV4ZWN1dGFibGVCEAoOc - 3RvcmFnZV9jb25maWcalQUKEEluZmVyZW5jZXJDb25maWcShQEKD2luZmVyZW5jZXJfYXJncxgBIAMoCzJHLnNuYXBjaGF0LnJlc - 2VhcmNoLmdibWwuR2JtbENvbmZpZy5JbmZlcmVuY2VyQ29uZmlnLkluZmVyZW5jZXJBcmdzRW50cnlCE+I/EBIOaW5mZXJlbmNlc - kFyZ3NSDmluZmVyZW5jZXJBcmdzEkYKE2luZmVyZW5jZXJfY2xzX3BhdGgYAiABKAlCFuI/ExIRaW5mZXJlbmNlckNsc1BhdGhSE - WluZmVyZW5jZXJDbHNQYXRoEikKCGNsc19wYXRoGGQgASgJQgziPwkSB2Nsc1BhdGhIAFIHY2xzUGF0aBIoCgdjb21tYW5kGGUgA - SgJQgziPwkSB2NvbW1hbmRIAFIHY29tbWFuZBJJChRpbmZlcmVuY2VfYmF0Y2hfc2l6ZRgFIAEoDUIX4j8UEhJpbmZlcmVuY2VCY - XRjaFNpemVSEmluZmVyZW5jZUJhdGNoU2l6ZRKXAQoaZ3JhcGhfc3RvcmVfc3RvcmFnZV9jb25maWcYBiABKAsyOi5zbmFwY2hhd - C5yZXNlYXJjaC5nYm1sLkdibWxDb25maWcuR3JhcGhTdG9yZVN0b3JhZ2VDb25maWdCHOI/GRIXZ3JhcGhTdG9yZVN0b3JhZ2VDb - 25maWdIAVIXZ3JhcGhTdG9yZVN0b3JhZ2VDb25maWcaVwoTSW5mZXJlbmNlckFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa - 2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4AUIMCgpleGVjdXRhYmxlQhAKDnN0b3JhZ2VfY29uZ - 
mlnGtsCChNQb3N0UHJvY2Vzc29yQ29uZmlnEpUBChNwb3N0X3Byb2Nlc3Nvcl9hcmdzGAEgAygLMk0uc25hcGNoYXQucmVzZWFyY - 2guZ2JtbC5HYm1sQ29uZmlnLlBvc3RQcm9jZXNzb3JDb25maWcuUG9zdFByb2Nlc3NvckFyZ3NFbnRyeUIW4j8TEhFwb3N0UHJvY - 2Vzc29yQXJnc1IRcG9zdFByb2Nlc3NvckFyZ3MSUAoXcG9zdF9wcm9jZXNzb3JfY2xzX3BhdGgYAiABKAlCGeI/FhIUcG9zdFByb - 2Nlc3NvckNsc1BhdGhSFHBvc3RQcm9jZXNzb3JDbHNQYXRoGloKFlBvc3RQcm9jZXNzb3JBcmdzRW50cnkSGgoDa2V5GAEgASgJQ - gjiPwUSA2tleVIDa2V5EiAKBXZhbHVlGAIgASgJQgriPwcSBXZhbHVlUgV2YWx1ZToCOAEanAIKDU1ldHJpY3NDb25maWcSPQoQb - WV0cmljc19jbHNfcGF0aBgBIAEoCUIT4j8QEg5tZXRyaWNzQ2xzUGF0aFIObWV0cmljc0Nsc1BhdGgSdgoMbWV0cmljc19hcmdzG - AIgAygLMkEuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5HYm1sQ29uZmlnLk1ldHJpY3NDb25maWcuTWV0cmljc0FyZ3NFbnRyeUIQ4 - j8NEgttZXRyaWNzQXJnc1ILbWV0cmljc0FyZ3MaVAoQTWV0cmljc0FyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZ - XkSIAoFdmFsdWUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4ARr0AgoOUHJvZmlsZXJDb25maWcSTwoWc2hvdWxkX2VuYWJsZ - V9wcm9maWxlchgBIAEoCEIZ4j8WEhRzaG91bGRFbmFibGVQcm9maWxlclIUc2hvdWxkRW5hYmxlUHJvZmlsZXISPQoQcHJvZmlsZ - XJfbG9nX2RpchgCIAEoCUIT4j8QEg5wcm9maWxlckxvZ0RpclIOcHJvZmlsZXJMb2dEaXISewoNcHJvZmlsZXJfYXJncxgDIAMoC - zJDLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuR2JtbENvbmZpZy5Qcm9maWxlckNvbmZpZy5Qcm9maWxlckFyZ3NFbnRyeUIR4j8OE - gxwcm9maWxlckFyZ3NSDHByb2ZpbGVyQXJncxpVChFQcm9maWxlckFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZ - XkSIAoFdmFsdWUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4ARpVChFGZWF0dXJlRmxhZ3NFbnRyeRIaCgNrZXkYASABKAlCC - OI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4AWIGcHJvdG8z""" + G91bGRMb2dUb1RlbnNvcmJvYXJkEpcBChpncmFwaF9zdG9yZV9zdG9yYWdlX2NvbmZpZxgNIAEoCzI6LnNuYXBjaGF0LnJlc2Vhc + mNoLmdibWwuR2JtbENvbmZpZy5HcmFwaFN0b3JlU3RvcmFnZUNvbmZpZ0Ic4j8ZEhdncmFwaFN0b3JlU3RvcmFnZUNvbmZpZ0gBU + hdncmFwaFN0b3JlU3RvcmFnZUNvbmZpZxpUChBUcmFpbmVyQXJnc0VudHJ5EhoKA2tleRgBIAEoCUII4j8FEgNrZXlSA2tleRIgC + gV2YWx1ZRgCIAEoCUIK4j8HEgV2YWx1ZVIFdmFsdWU6AjgBQgwKCmV4ZWN1dGFibGVCEAoOc3RvcmFnZV9jb25maWcalQUKEEluZ + 
mVyZW5jZXJDb25maWcShQEKD2luZmVyZW5jZXJfYXJncxgBIAMoCzJHLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuR2JtbENvbmZpZ + y5JbmZlcmVuY2VyQ29uZmlnLkluZmVyZW5jZXJBcmdzRW50cnlCE+I/EBIOaW5mZXJlbmNlckFyZ3NSDmluZmVyZW5jZXJBcmdzE + kYKE2luZmVyZW5jZXJfY2xzX3BhdGgYAiABKAlCFuI/ExIRaW5mZXJlbmNlckNsc1BhdGhSEWluZmVyZW5jZXJDbHNQYXRoEikKC + GNsc19wYXRoGGQgASgJQgziPwkSB2Nsc1BhdGhIAFIHY2xzUGF0aBIoCgdjb21tYW5kGGUgASgJQgziPwkSB2NvbW1hbmRIAFIHY + 29tbWFuZBJJChRpbmZlcmVuY2VfYmF0Y2hfc2l6ZRgFIAEoDUIX4j8UEhJpbmZlcmVuY2VCYXRjaFNpemVSEmluZmVyZW5jZUJhd + GNoU2l6ZRKXAQoaZ3JhcGhfc3RvcmVfc3RvcmFnZV9jb25maWcYBiABKAsyOi5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkdibWxDb + 25maWcuR3JhcGhTdG9yZVN0b3JhZ2VDb25maWdCHOI/GRIXZ3JhcGhTdG9yZVN0b3JhZ2VDb25maWdIAVIXZ3JhcGhTdG9yZVN0b + 3JhZ2VDb25maWcaVwoTSW5mZXJlbmNlckFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABK + AlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4AUIMCgpleGVjdXRhYmxlQhAKDnN0b3JhZ2VfY29uZmlnGtsCChNQb3N0UHJvY2Vzc29yQ + 29uZmlnEpUBChNwb3N0X3Byb2Nlc3Nvcl9hcmdzGAEgAygLMk0uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5HYm1sQ29uZmlnLlBvc + 3RQcm9jZXNzb3JDb25maWcuUG9zdFByb2Nlc3NvckFyZ3NFbnRyeUIW4j8TEhFwb3N0UHJvY2Vzc29yQXJnc1IRcG9zdFByb2Nlc + 3NvckFyZ3MSUAoXcG9zdF9wcm9jZXNzb3JfY2xzX3BhdGgYAiABKAlCGeI/FhIUcG9zdFByb2Nlc3NvckNsc1BhdGhSFHBvc3RQc + m9jZXNzb3JDbHNQYXRoGloKFlBvc3RQcm9jZXNzb3JBcmdzRW50cnkSGgoDa2V5GAEgASgJQgjiPwUSA2tleVIDa2V5EiAKBXZhb + HVlGAIgASgJQgriPwcSBXZhbHVlUgV2YWx1ZToCOAEanAIKDU1ldHJpY3NDb25maWcSPQoQbWV0cmljc19jbHNfcGF0aBgBIAEoC + UIT4j8QEg5tZXRyaWNzQ2xzUGF0aFIObWV0cmljc0Nsc1BhdGgSdgoMbWV0cmljc19hcmdzGAIgAygLMkEuc25hcGNoYXQucmVzZ + WFyY2guZ2JtbC5HYm1sQ29uZmlnLk1ldHJpY3NDb25maWcuTWV0cmljc0FyZ3NFbnRyeUIQ4j8NEgttZXRyaWNzQXJnc1ILbWV0c + mljc0FyZ3MaVAoQTWV0cmljc0FyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCCuI/B + xIFdmFsdWVSBXZhbHVlOgI4ARr0AgoOUHJvZmlsZXJDb25maWcSTwoWc2hvdWxkX2VuYWJsZV9wcm9maWxlchgBIAEoCEIZ4j8WE + hRzaG91bGRFbmFibGVQcm9maWxlclIUc2hvdWxkRW5hYmxlUHJvZmlsZXISPQoQcHJvZmlsZXJfbG9nX2RpchgCIAEoCUIT4j8QE + 
g5wcm9maWxlckxvZ0RpclIOcHJvZmlsZXJMb2dEaXISewoNcHJvZmlsZXJfYXJncxgDIAMoCzJDLnNuYXBjaGF0LnJlc2VhcmNoL + mdibWwuR2JtbENvbmZpZy5Qcm9maWxlckNvbmZpZy5Qcm9maWxlckFyZ3NFbnRyeUIR4j8OEgxwcm9maWxlckFyZ3NSDHByb2Zpb + GVyQXJncxpVChFQcm9maWxlckFyZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsdWUYAiABKAlCCuI/B + xIFdmFsdWVSBXZhbHVlOgI4ARpVChFGZWF0dXJlRmxhZ3NFbnRyeRIaCgNrZXkYASABKAlCCOI/BRIDa2V5UgNrZXkSIAoFdmFsd + WUYAiABKAlCCuI/BxIFdmFsdWVSBXZhbHVlOgI4AWIGcHJvdG8z""" ).mkString) lazy val scalaDescriptor: _root_.scalapb.descriptors.FileDescriptor = { val scalaProto = com.google.protobuf.descriptor.FileDescriptorProto.parseFrom(ProtoBytes) diff --git a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala index 94ffd417b..da5ed6523 100644 --- a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala +++ b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/GiglResourceConfigProto.scala @@ -48,7 +48,7 @@ object GiglResourceConfigProto extends _root_.scalapb.GeneratedFileObject { XQSMwoMbnVtX3JlcGxpY2FzGAUgASgNQhDiPw0SC251bVJlcGxpY2FzUgtudW1SZXBsaWNhcyJGChJMb2NhbFRyYWluZXJDb25ma WcSMAoLbnVtX3dvcmtlcnMYASABKA1CD+I/DBIKbnVtV29ya2Vyc1IKbnVtV29ya2VycyKZAQobVmVydGV4QWlSZXNlcnZhdGlvb kFmZmluaXR5Eh0KBHR5cGUYASABKAlCCeI/BhIEdHlwZVIEdHlwZRJbChpyZXNlcnZhdGlvbl9yZXNvdXJjZV9uYW1lcxgCIAMoC - UId4j8aEhhyZXNlcnZhdGlvblJlc291cmNlTmFtZXNSGHJlc2VydmF0aW9uUmVzb3VyY2VOYW1lcyKuBQoWVmVydGV4QWlSZXNvd + UId4j8aEhhyZXNlcnZhdGlvblJlc291cmNlTmFtZXNSGHJlc2VydmF0aW9uUmVzb3VyY2VOYW1lcyKOBgoWVmVydGV4QWlSZXNvd XJjZUNvbmZpZxIzCgxtYWNoaW5lX3R5cGUYASABKAlCEOI/DRILbWFjaGluZVR5cGVSC21hY2hpbmVUeXBlEicKCGdwdV90eXBlG AIgASgJQgziPwkSB2dwdVR5cGVSB2dwdVR5cGUSKgoJZ3B1X2xpbWl0GAMgASgNQg3iPwoSCGdwdUxpbWl0UghncHVMaW1pdBIzC 
gxudW1fcmVwbGljYXMYBCABKA1CEOI/DRILbnVtUmVwbGljYXNSC251bVJlcGxpY2FzEiYKB3RpbWVvdXQYBSABKA1CDOI/CRIHd @@ -57,75 +57,76 @@ object GiglResourceConfigProto extends _root_.scalapb.GeneratedFileObject { GluZ1N0cmF0ZWd5Ej4KEWJvb3RfZGlza19zaXplX2diGAggASgNQhPiPxASDmJvb3REaXNrU2l6ZUdiUg5ib290RGlza1NpemVHY hKAAQoUcmVzZXJ2YXRpb25fYWZmaW5pdHkYCSABKAsyMy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpUmVzZXJ2YXRpb 25BZmZpbml0eUIY4j8VEhNyZXNlcnZhdGlvbkFmZmluaXR5UhNyZXNlcnZhdGlvbkFmZmluaXR5ElgKGXRlbnNvcmJvYXJkX3Jlc - 291cmNlX25hbWUYCiABKAlCHOI/GRIXdGVuc29yYm9hcmRSZXNvdXJjZU5hbWVSF3RlbnNvcmJvYXJkUmVzb3VyY2VOYW1lIooCC - hFLRlBSZXNvdXJjZUNvbmZpZxIwCgtjcHVfcmVxdWVzdBgBIAEoCUIP4j8MEgpjcHVSZXF1ZXN0UgpjcHVSZXF1ZXN0EjkKDm1lb - W9yeV9yZXF1ZXN0GAIgASgJQhLiPw8SDW1lbW9yeVJlcXVlc3RSDW1lbW9yeVJlcXVlc3QSJwoIZ3B1X3R5cGUYAyABKAlCDOI/C - RIHZ3B1VHlwZVIHZ3B1VHlwZRIqCglncHVfbGltaXQYBCABKA1CDeI/ChIIZ3B1TGltaXRSCGdwdUxpbWl0EjMKDG51bV9yZXBsa - WNhcxgFIAEoDUIQ4j8NEgtudW1SZXBsaWNhc1ILbnVtUmVwbGljYXMiRwoTTG9jYWxSZXNvdXJjZUNvbmZpZxIwCgtudW1fd29ya - 2VycxgBIAEoDUIP4j8MEgpudW1Xb3JrZXJzUgpudW1Xb3JrZXJzItkCChhWZXJ0ZXhBaUdyYXBoU3RvcmVDb25maWcSbQoQZ3Jhc - Ghfc3RvcmVfcG9vbBgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZXNvdXJjZUNvbmZpZ0IT4j8QEg5nc - mFwaFN0b3JlUG9vbFIOZ3JhcGhTdG9yZVBvb2wSYwoMY29tcHV0ZV9wb29sGAIgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2Jtb - C5WZXJ0ZXhBaVJlc291cmNlQ29uZmlnQhDiPw0SC2NvbXB1dGVQb29sUgtjb21wdXRlUG9vbBJpCiBjb21wdXRlX2NsdXN0ZXJfb - G9jYWxfd29ybGRfc2l6ZRgDIAEoBUIh4j8eEhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTaXplUhxjb21wdXRlQ2x1c3RlckxvY - 2FsV29ybGRTaXplIp0DChhEaXN0cmlidXRlZFRyYWluZXJDb25maWcShAEKGHZlcnRleF9haV90cmFpbmVyX2NvbmZpZxgBIAEoC - zItLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlUcmFpbmVyQ29uZmlnQhriPxcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ - 0gAUhV2ZXJ0ZXhBaVRyYWluZXJDb25maWcSbwoSa2ZwX3RyYWluZXJfY29uZmlnGAIgASgLMiguc25hcGNoYXQucmVzZWFyY2guZ - 2JtbC5LRlBUcmFpbmVyQ29uZmlnQhXiPxISEGtmcFRyYWluZXJDb25maWdIAFIQa2ZwVHJhaW5lckNvbmZpZxJ3ChRsb2NhbF90c - 
mFpbmVyX2NvbmZpZxgDIAEoCzIqLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxUcmFpbmVyQ29uZmlnQhfiPxQSEmxvY2FsV - HJhaW5lckNvbmZpZ0gAUhJsb2NhbFRyYWluZXJDb25maWdCEAoOdHJhaW5lcl9jb25maWcixwQKFVRyYWluZXJSZXNvdXJjZUNvb - mZpZxKFAQoYdmVydGV4X2FpX3RyYWluZXJfY29uZmlnGAEgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaVJlc - 291cmNlQ29uZmlnQhriPxcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyYWluZXJDb25maWcScAoSa2ZwX3RyY - WluZXJfY29uZmlnGAIgASgLMikuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBSZXNvdXJjZUNvbmZpZ0IV4j8SEhBrZnBUcmFpb - mVyQ29uZmlnSABSEGtmcFRyYWluZXJDb25maWcSeAoUbG9jYWxfdHJhaW5lcl9jb25maWcYAyABKAsyKy5zbmFwY2hhdC5yZXNlY - XJjaC5nYm1sLkxvY2FsUmVzb3VyY2VDb25maWdCF+I/FBISbG9jYWxUcmFpbmVyQ29uZmlnSABSEmxvY2FsVHJhaW5lckNvbmZpZ - xKnAQokdmVydGV4X2FpX2dyYXBoX3N0b3JlX3RyYWluZXJfY29uZmlnGAQgASgLMjAuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZ - XJ0ZXhBaUdyYXBoU3RvcmVDb25maWdCJOI/IRIfdmVydGV4QWlHcmFwaFN0b3JlVHJhaW5lckNvbmZpZ0gAUh92ZXJ0ZXhBaUdyY - XBoU3RvcmVUcmFpbmVyQ29uZmlnQhAKDnRyYWluZXJfY29uZmlnIocFChhJbmZlcmVuY2VyUmVzb3VyY2VDb25maWcSjgEKG3Zlc - nRleF9haV9pbmZlcmVuY2VyX2NvbmZpZxgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZXNvdXJjZUNvb - mZpZ0Id4j8aEhh2ZXJ0ZXhBaUluZmVyZW5jZXJDb25maWdIAFIYdmVydGV4QWlJbmZlcmVuY2VyQ29uZmlnEo0BChpkYXRhZmxvd - 19pbmZlcmVuY2VyX2NvbmZpZxgCIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuRGF0YWZsb3dSZXNvdXJjZUNvbmZpZ0Id4 - j8aEhhkYXRhZmxvd0luZmVyZW5jZXJDb25maWdIAFIYZGF0YWZsb3dJbmZlcmVuY2VyQ29uZmlnEoEBChdsb2NhbF9pbmZlcmVuY - 2VyX2NvbmZpZxgDIAEoCzIrLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxSZXNvdXJjZUNvbmZpZ0Ia4j8XEhVsb2NhbEluZ - mVyZW5jZXJDb25maWdIAFIVbG9jYWxJbmZlcmVuY2VyQ29uZmlnErABCid2ZXJ0ZXhfYWlfZ3JhcGhfc3RvcmVfaW5mZXJlbmNlc - l9jb25maWcYBCABKAsyMC5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpR3JhcGhTdG9yZUNvbmZpZ0In4j8kEiJ2ZXJ0Z - XhBaUdyYXBoU3RvcmVJbmZlcmVuY2VyQ29uZmlnSABSInZlcnRleEFpR3JhcGhTdG9yZUluZmVyZW5jZXJDb25maWdCEwoRaW5mZ - XJlbmNlcl9jb25maWcilwgKFFNoYXJlZFJlc291cmNlQ29uZmlnEn4KD3Jlc291cmNlX2xhYmVscxgBIAMoCzJALnNuYXBjaGF0L - 
nJlc2VhcmNoLmdibWwuU2hhcmVkUmVzb3VyY2VDb25maWcuUmVzb3VyY2VMYWJlbHNFbnRyeUIT4j8QEg5yZXNvdXJjZUxhYmVsc - 1IOcmVzb3VyY2VMYWJlbHMSjgEKFWNvbW1vbl9jb21wdXRlX2NvbmZpZxgCIAEoCzJALnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuU - 2hhcmVkUmVzb3VyY2VDb25maWcuQ29tbW9uQ29tcHV0ZUNvbmZpZ0IY4j8VEhNjb21tb25Db21wdXRlQ29uZmlnUhNjb21tb25Db - 21wdXRlQ29uZmlnGpQFChNDb21tb25Db21wdXRlQ29uZmlnEiYKB3Byb2plY3QYASABKAlCDOI/CRIHcHJvamVjdFIHcHJvamVjd - BIjCgZyZWdpb24YAiABKAlCC+I/CBIGcmVnaW9uUgZyZWdpb24SQwoSdGVtcF9hc3NldHNfYnVja2V0GAMgASgJQhXiPxISEHRlb - XBBc3NldHNCdWNrZXRSEHRlbXBBc3NldHNCdWNrZXQSXAobdGVtcF9yZWdpb25hbF9hc3NldHNfYnVja2V0GAQgASgJQh3iPxoSG - HRlbXBSZWdpb25hbEFzc2V0c0J1Y2tldFIYdGVtcFJlZ2lvbmFsQXNzZXRzQnVja2V0EkMKEnBlcm1fYXNzZXRzX2J1Y2tldBgFI - AEoCUIV4j8SEhBwZXJtQXNzZXRzQnVja2V0UhBwZXJtQXNzZXRzQnVja2V0EloKG3RlbXBfYXNzZXRzX2JxX2RhdGFzZXRfbmFtZ - RgGIAEoCUIc4j8ZEhd0ZW1wQXNzZXRzQnFEYXRhc2V0TmFtZVIXdGVtcEFzc2V0c0JxRGF0YXNldE5hbWUSVgoZZW1iZWRkaW5nX - 2JxX2RhdGFzZXRfbmFtZRgHIAEoCUIb4j8YEhZlbWJlZGRpbmdCcURhdGFzZXROYW1lUhZlbWJlZGRpbmdCcURhdGFzZXROYW1lE - lYKGWdjcF9zZXJ2aWNlX2FjY291bnRfZW1haWwYCCABKAlCG+I/GBIWZ2NwU2VydmljZUFjY291bnRFbWFpbFIWZ2NwU2VydmljZ - UFjY291bnRFbWFpbBI8Cg9kYXRhZmxvd19ydW5uZXIYCyABKAlCE+I/EBIOZGF0YWZsb3dSdW5uZXJSDmRhdGFmbG93UnVubmVyG - lcKE1Jlc291cmNlTGFiZWxzRW50cnkSGgoDa2V5GAEgASgJQgjiPwUSA2tleVIDa2V5EiAKBXZhbHVlGAIgASgJQgriPwcSBXZhb - HVlUgV2YWx1ZToCOAEi9wgKEkdpZ2xSZXNvdXJjZUNvbmZpZxJbChpzaGFyZWRfcmVzb3VyY2VfY29uZmlnX3VyaRgBIAEoCUIc4 - j8ZEhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1VyaUgAUhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1VyaRJ/ChZzaGFyZWRfcmVzb3VyY2VfY - 29uZmlnGAIgASgLMiwuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5TaGFyZWRSZXNvdXJjZUNvbmZpZ0IZ4j8WEhRzaGFyZWRSZXNvd - XJjZUNvbmZpZ0gAUhRzaGFyZWRSZXNvdXJjZUNvbmZpZxJ4ChNwcmVwcm9jZXNzb3JfY29uZmlnGAwgASgLMi4uc25hcGNoYXQuc - mVzZWFyY2guZ2JtbC5EYXRhUHJlcHJvY2Vzc29yQ29uZmlnQhfiPxQSEnByZXByb2Nlc3NvckNvbmZpZ1IScHJlcHJvY2Vzc29yQ - 29uZmlnEn8KF3N1YmdyYXBoX3NhbXBsZXJfY29uZmlnGA0gASgLMisuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5TcGFya1Jlc291c - 
mNlQ29uZmlnQhriPxcSFXN1YmdyYXBoU2FtcGxlckNvbmZpZ1IVc3ViZ3JhcGhTYW1wbGVyQ29uZmlnEnwKFnNwbGl0X2dlbmVyY - XRvcl9jb25maWcYDiABKAsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlNwYXJrUmVzb3VyY2VDb25maWdCGeI/FhIUc3BsaXRHZ - W5lcmF0b3JDb25maWdSFHNwbGl0R2VuZXJhdG9yQ29uZmlnEm0KDnRyYWluZXJfY29uZmlnGA8gASgLMjAuc25hcGNoYXQucmVzZ - WFyY2guZ2JtbC5EaXN0cmlidXRlZFRyYWluZXJDb25maWdCFBgB4j8PEg10cmFpbmVyQ29uZmlnUg10cmFpbmVyQ29uZmlnEnQKE - WluZmVyZW5jZXJfY29uZmlnGBAgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EYXRhZmxvd1Jlc291cmNlQ29uZmlnQhcYA - eI/EhIQaW5mZXJlbmNlckNvbmZpZ1IQaW5mZXJlbmNlckNvbmZpZxKBAQoXdHJhaW5lcl9yZXNvdXJjZV9jb25maWcYESABKAsyL - S5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlRyYWluZXJSZXNvdXJjZUNvbmZpZ0Ia4j8XEhV0cmFpbmVyUmVzb3VyY2VDb25maWdSF - XRyYWluZXJSZXNvdXJjZUNvbmZpZxKNAQoaaW5mZXJlbmNlcl9yZXNvdXJjZV9jb25maWcYEiABKAsyMC5zbmFwY2hhdC5yZXNlY - XJjaC5nYm1sLkluZmVyZW5jZXJSZXNvdXJjZUNvbmZpZ0Id4j8aEhhpbmZlcmVuY2VyUmVzb3VyY2VDb25maWdSGGluZmVyZW5jZ - XJSZXNvdXJjZUNvbmZpZ0IRCg9zaGFyZWRfcmVzb3VyY2Uq4wMKCUNvbXBvbmVudBItChFDb21wb25lbnRfVW5rbm93bhAAGhbiP - xMSEUNvbXBvbmVudF9Vbmtub3duEj8KGkNvbXBvbmVudF9Db25maWdfVmFsaWRhdG9yEAEaH+I/HBIaQ29tcG9uZW50X0NvbmZpZ - 19WYWxpZGF0b3ISPwoaQ29tcG9uZW50X0NvbmZpZ19Qb3B1bGF0b3IQAhof4j8cEhpDb21wb25lbnRfQ29uZmlnX1BvcHVsYXRvc - hJBChtDb21wb25lbnRfRGF0YV9QcmVwcm9jZXNzb3IQAxog4j8dEhtDb21wb25lbnRfRGF0YV9QcmVwcm9jZXNzb3ISPwoaQ29tc - G9uZW50X1N1YmdyYXBoX1NhbXBsZXIQBBof4j8cEhpDb21wb25lbnRfU3ViZ3JhcGhfU2FtcGxlchI9ChlDb21wb25lbnRfU3Bsa - XRfR2VuZXJhdG9yEAUaHuI/GxIZQ29tcG9uZW50X1NwbGl0X0dlbmVyYXRvchItChFDb21wb25lbnRfVHJhaW5lchAGGhbiPxMSE - UNvbXBvbmVudF9UcmFpbmVyEjMKFENvbXBvbmVudF9JbmZlcmVuY2VyEAcaGeI/FhIUQ29tcG9uZW50X0luZmVyZW5jZXJiBnByb - 3RvMw==""" + 291cmNlX25hbWUYCiABKAlCHOI/GRIXdGVuc29yYm9hcmRSZXNvdXJjZU5hbWVSF3RlbnNvcmJvYXJkUmVzb3VyY2VOYW1lEl4KG + 3RlbnNvcmJvYXJkX2V4cGVyaW1lbnRfbmFtZRgLIAEoCUIe4j8bEhl0ZW5zb3Jib2FyZEV4cGVyaW1lbnROYW1lUhl0ZW5zb3Jib + 2FyZEV4cGVyaW1lbnROYW1lIooCChFLRlBSZXNvdXJjZUNvbmZpZxIwCgtjcHVfcmVxdWVzdBgBIAEoCUIP4j8MEgpjcHVSZXF1Z + 
XN0UgpjcHVSZXF1ZXN0EjkKDm1lbW9yeV9yZXF1ZXN0GAIgASgJQhLiPw8SDW1lbW9yeVJlcXVlc3RSDW1lbW9yeVJlcXVlc3QSJ + woIZ3B1X3R5cGUYAyABKAlCDOI/CRIHZ3B1VHlwZVIHZ3B1VHlwZRIqCglncHVfbGltaXQYBCABKA1CDeI/ChIIZ3B1TGltaXRSC + GdwdUxpbWl0EjMKDG51bV9yZXBsaWNhcxgFIAEoDUIQ4j8NEgtudW1SZXBsaWNhc1ILbnVtUmVwbGljYXMiRwoTTG9jYWxSZXNvd + XJjZUNvbmZpZxIwCgtudW1fd29ya2VycxgBIAEoDUIP4j8MEgpudW1Xb3JrZXJzUgpudW1Xb3JrZXJzItkCChhWZXJ0ZXhBaUdyY + XBoU3RvcmVDb25maWcSbQoQZ3JhcGhfc3RvcmVfcG9vbBgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlSZ + XNvdXJjZUNvbmZpZ0IT4j8QEg5ncmFwaFN0b3JlUG9vbFIOZ3JhcGhTdG9yZVBvb2wSYwoMY29tcHV0ZV9wb29sGAIgASgLMi4uc + 25hcGNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaVJlc291cmNlQ29uZmlnQhDiPw0SC2NvbXB1dGVQb29sUgtjb21wdXRlUG9vb + BJpCiBjb21wdXRlX2NsdXN0ZXJfbG9jYWxfd29ybGRfc2l6ZRgDIAEoBUIh4j8eEhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTa + XplUhxjb21wdXRlQ2x1c3RlckxvY2FsV29ybGRTaXplIp0DChhEaXN0cmlidXRlZFRyYWluZXJDb25maWcShAEKGHZlcnRleF9ha + V90cmFpbmVyX2NvbmZpZxgBIAEoCzItLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuVmVydGV4QWlUcmFpbmVyQ29uZmlnQhriPxcSF + XZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyYWluZXJDb25maWcSbwoSa2ZwX3RyYWluZXJfY29uZmlnGAIgASgLM + iguc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBUcmFpbmVyQ29uZmlnQhXiPxISEGtmcFRyYWluZXJDb25maWdIAFIQa2ZwVHJha + W5lckNvbmZpZxJ3ChRsb2NhbF90cmFpbmVyX2NvbmZpZxgDIAEoCzIqLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxUcmFpb + mVyQ29uZmlnQhfiPxQSEmxvY2FsVHJhaW5lckNvbmZpZ0gAUhJsb2NhbFRyYWluZXJDb25maWdCEAoOdHJhaW5lcl9jb25maWcix + wQKFVRyYWluZXJSZXNvdXJjZUNvbmZpZxKFAQoYdmVydGV4X2FpX3RyYWluZXJfY29uZmlnGAEgASgLMi4uc25hcGNoYXQucmVzZ + WFyY2guZ2JtbC5WZXJ0ZXhBaVJlc291cmNlQ29uZmlnQhriPxcSFXZlcnRleEFpVHJhaW5lckNvbmZpZ0gAUhV2ZXJ0ZXhBaVRyY + WluZXJDb25maWcScAoSa2ZwX3RyYWluZXJfY29uZmlnGAIgASgLMikuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5LRlBSZXNvdXJjZ + UNvbmZpZ0IV4j8SEhBrZnBUcmFpbmVyQ29uZmlnSABSEGtmcFRyYWluZXJDb25maWcSeAoUbG9jYWxfdHJhaW5lcl9jb25maWcYA + yABKAsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkxvY2FsUmVzb3VyY2VDb25maWdCF+I/FBISbG9jYWxUcmFpbmVyQ29uZmlnS + 
ABSEmxvY2FsVHJhaW5lckNvbmZpZxKnAQokdmVydGV4X2FpX2dyYXBoX3N0b3JlX3RyYWluZXJfY29uZmlnGAQgASgLMjAuc25hc + GNoYXQucmVzZWFyY2guZ2JtbC5WZXJ0ZXhBaUdyYXBoU3RvcmVDb25maWdCJOI/IRIfdmVydGV4QWlHcmFwaFN0b3JlVHJhaW5lc + kNvbmZpZ0gAUh92ZXJ0ZXhBaUdyYXBoU3RvcmVUcmFpbmVyQ29uZmlnQhAKDnRyYWluZXJfY29uZmlnIocFChhJbmZlcmVuY2VyU + mVzb3VyY2VDb25maWcSjgEKG3ZlcnRleF9haV9pbmZlcmVuY2VyX2NvbmZpZxgBIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdib + WwuVmVydGV4QWlSZXNvdXJjZUNvbmZpZ0Id4j8aEhh2ZXJ0ZXhBaUluZmVyZW5jZXJDb25maWdIAFIYdmVydGV4QWlJbmZlcmVuY + 2VyQ29uZmlnEo0BChpkYXRhZmxvd19pbmZlcmVuY2VyX2NvbmZpZxgCIAEoCzIuLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuRGF0Y + WZsb3dSZXNvdXJjZUNvbmZpZ0Id4j8aEhhkYXRhZmxvd0luZmVyZW5jZXJDb25maWdIAFIYZGF0YWZsb3dJbmZlcmVuY2VyQ29uZ + mlnEoEBChdsb2NhbF9pbmZlcmVuY2VyX2NvbmZpZxgDIAEoCzIrLnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuTG9jYWxSZXNvdXJjZ + UNvbmZpZ0Ia4j8XEhVsb2NhbEluZmVyZW5jZXJDb25maWdIAFIVbG9jYWxJbmZlcmVuY2VyQ29uZmlnErABCid2ZXJ0ZXhfYWlfZ + 3JhcGhfc3RvcmVfaW5mZXJlbmNlcl9jb25maWcYBCABKAsyMC5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlZlcnRleEFpR3JhcGhTd + G9yZUNvbmZpZ0In4j8kEiJ2ZXJ0ZXhBaUdyYXBoU3RvcmVJbmZlcmVuY2VyQ29uZmlnSABSInZlcnRleEFpR3JhcGhTdG9yZUluZ + mVyZW5jZXJDb25maWdCEwoRaW5mZXJlbmNlcl9jb25maWcilwgKFFNoYXJlZFJlc291cmNlQ29uZmlnEn4KD3Jlc291cmNlX2xhY + mVscxgBIAMoCzJALnNuYXBjaGF0LnJlc2VhcmNoLmdibWwuU2hhcmVkUmVzb3VyY2VDb25maWcuUmVzb3VyY2VMYWJlbHNFbnRye + UIT4j8QEg5yZXNvdXJjZUxhYmVsc1IOcmVzb3VyY2VMYWJlbHMSjgEKFWNvbW1vbl9jb21wdXRlX2NvbmZpZxgCIAEoCzJALnNuY + XBjaGF0LnJlc2VhcmNoLmdibWwuU2hhcmVkUmVzb3VyY2VDb25maWcuQ29tbW9uQ29tcHV0ZUNvbmZpZ0IY4j8VEhNjb21tb25Db + 21wdXRlQ29uZmlnUhNjb21tb25Db21wdXRlQ29uZmlnGpQFChNDb21tb25Db21wdXRlQ29uZmlnEiYKB3Byb2plY3QYASABKAlCD + OI/CRIHcHJvamVjdFIHcHJvamVjdBIjCgZyZWdpb24YAiABKAlCC+I/CBIGcmVnaW9uUgZyZWdpb24SQwoSdGVtcF9hc3NldHNfY + nVja2V0GAMgASgJQhXiPxISEHRlbXBBc3NldHNCdWNrZXRSEHRlbXBBc3NldHNCdWNrZXQSXAobdGVtcF9yZWdpb25hbF9hc3Nld + HNfYnVja2V0GAQgASgJQh3iPxoSGHRlbXBSZWdpb25hbEFzc2V0c0J1Y2tldFIYdGVtcFJlZ2lvbmFsQXNzZXRzQnVja2V0EkMKE + 
nBlcm1fYXNzZXRzX2J1Y2tldBgFIAEoCUIV4j8SEhBwZXJtQXNzZXRzQnVja2V0UhBwZXJtQXNzZXRzQnVja2V0EloKG3RlbXBfY + XNzZXRzX2JxX2RhdGFzZXRfbmFtZRgGIAEoCUIc4j8ZEhd0ZW1wQXNzZXRzQnFEYXRhc2V0TmFtZVIXdGVtcEFzc2V0c0JxRGF0Y + XNldE5hbWUSVgoZZW1iZWRkaW5nX2JxX2RhdGFzZXRfbmFtZRgHIAEoCUIb4j8YEhZlbWJlZGRpbmdCcURhdGFzZXROYW1lUhZlb + WJlZGRpbmdCcURhdGFzZXROYW1lElYKGWdjcF9zZXJ2aWNlX2FjY291bnRfZW1haWwYCCABKAlCG+I/GBIWZ2NwU2VydmljZUFjY + 291bnRFbWFpbFIWZ2NwU2VydmljZUFjY291bnRFbWFpbBI8Cg9kYXRhZmxvd19ydW5uZXIYCyABKAlCE+I/EBIOZGF0YWZsb3dSd + W5uZXJSDmRhdGFmbG93UnVubmVyGlcKE1Jlc291cmNlTGFiZWxzRW50cnkSGgoDa2V5GAEgASgJQgjiPwUSA2tleVIDa2V5EiAKB + XZhbHVlGAIgASgJQgriPwcSBXZhbHVlUgV2YWx1ZToCOAEi9wgKEkdpZ2xSZXNvdXJjZUNvbmZpZxJbChpzaGFyZWRfcmVzb3VyY + 2VfY29uZmlnX3VyaRgBIAEoCUIc4j8ZEhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1VyaUgAUhdzaGFyZWRSZXNvdXJjZUNvbmZpZ1Vya + RJ/ChZzaGFyZWRfcmVzb3VyY2VfY29uZmlnGAIgASgLMiwuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5TaGFyZWRSZXNvdXJjZUNvb + mZpZ0IZ4j8WEhRzaGFyZWRSZXNvdXJjZUNvbmZpZ0gAUhRzaGFyZWRSZXNvdXJjZUNvbmZpZxJ4ChNwcmVwcm9jZXNzb3JfY29uZ + mlnGAwgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EYXRhUHJlcHJvY2Vzc29yQ29uZmlnQhfiPxQSEnByZXByb2Nlc3Nvc + kNvbmZpZ1IScHJlcHJvY2Vzc29yQ29uZmlnEn8KF3N1YmdyYXBoX3NhbXBsZXJfY29uZmlnGA0gASgLMisuc25hcGNoYXQucmVzZ + WFyY2guZ2JtbC5TcGFya1Jlc291cmNlQ29uZmlnQhriPxcSFXN1YmdyYXBoU2FtcGxlckNvbmZpZ1IVc3ViZ3JhcGhTYW1wbGVyQ + 29uZmlnEnwKFnNwbGl0X2dlbmVyYXRvcl9jb25maWcYDiABKAsyKy5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlNwYXJrUmVzb3VyY + 2VDb25maWdCGeI/FhIUc3BsaXRHZW5lcmF0b3JDb25maWdSFHNwbGl0R2VuZXJhdG9yQ29uZmlnEm0KDnRyYWluZXJfY29uZmlnG + A8gASgLMjAuc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EaXN0cmlidXRlZFRyYWluZXJDb25maWdCFBgB4j8PEg10cmFpbmVyQ29uZ + mlnUg10cmFpbmVyQ29uZmlnEnQKEWluZmVyZW5jZXJfY29uZmlnGBAgASgLMi4uc25hcGNoYXQucmVzZWFyY2guZ2JtbC5EYXRhZ + mxvd1Jlc291cmNlQ29uZmlnQhcYAeI/EhIQaW5mZXJlbmNlckNvbmZpZ1IQaW5mZXJlbmNlckNvbmZpZxKBAQoXdHJhaW5lcl9yZ + XNvdXJjZV9jb25maWcYESABKAsyLS5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLlRyYWluZXJSZXNvdXJjZUNvbmZpZ0Ia4j8XEhV0c + 
mFpbmVyUmVzb3VyY2VDb25maWdSFXRyYWluZXJSZXNvdXJjZUNvbmZpZxKNAQoaaW5mZXJlbmNlcl9yZXNvdXJjZV9jb25maWcYE + iABKAsyMC5zbmFwY2hhdC5yZXNlYXJjaC5nYm1sLkluZmVyZW5jZXJSZXNvdXJjZUNvbmZpZ0Id4j8aEhhpbmZlcmVuY2VyUmVzb + 3VyY2VDb25maWdSGGluZmVyZW5jZXJSZXNvdXJjZUNvbmZpZ0IRCg9zaGFyZWRfcmVzb3VyY2Uq4wMKCUNvbXBvbmVudBItChFDb + 21wb25lbnRfVW5rbm93bhAAGhbiPxMSEUNvbXBvbmVudF9Vbmtub3duEj8KGkNvbXBvbmVudF9Db25maWdfVmFsaWRhdG9yEAEaH + +I/HBIaQ29tcG9uZW50X0NvbmZpZ19WYWxpZGF0b3ISPwoaQ29tcG9uZW50X0NvbmZpZ19Qb3B1bGF0b3IQAhof4j8cEhpDb21wb + 25lbnRfQ29uZmlnX1BvcHVsYXRvchJBChtDb21wb25lbnRfRGF0YV9QcmVwcm9jZXNzb3IQAxog4j8dEhtDb21wb25lbnRfRGF0Y + V9QcmVwcm9jZXNzb3ISPwoaQ29tcG9uZW50X1N1YmdyYXBoX1NhbXBsZXIQBBof4j8cEhpDb21wb25lbnRfU3ViZ3JhcGhfU2Ftc + GxlchI9ChlDb21wb25lbnRfU3BsaXRfR2VuZXJhdG9yEAUaHuI/GxIZQ29tcG9uZW50X1NwbGl0X0dlbmVyYXRvchItChFDb21wb + 25lbnRfVHJhaW5lchAGGhbiPxMSEUNvbXBvbmVudF9UcmFpbmVyEjMKFENvbXBvbmVudF9JbmZlcmVuY2VyEAcaGeI/FhIUQ29tc + G9uZW50X0luZmVyZW5jZXJiBnByb3RvMw==""" ).mkString) lazy val scalaDescriptor: _root_.scalapb.descriptors.FileDescriptor = { val scalaProto = com.google.protobuf.descriptor.FileDescriptorProto.parseFrom(ProtoBytes) diff --git a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala index 8a29093bb..d863014af 100644 --- a/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala +++ b/scala_spark35/common/src/main/scala/snapchat/research/gbml/gigl_resource_config/VertexAiResourceConfig.scala @@ -41,6 +41,15 @@ package snapchat.research.gbml.gigl_resource_config * Format: projects/{project}/locations/{region}/tensorboards/{tensorboard_id} * See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview * for the Tensorboard data model. + * @param tensorboardExperimentName + * Optional. 
When set, the trainer's chief rank streams events to a + * TensorboardExperiment with this name on the TB resource above, in + * addition to Vertex's per-job auto-upload. Multiple jobs that share this + * value land in the same TensorboardExperiment, so they appear as + * comparable runs on one TensorBoard page. Requires + * tensorboard_resource_name above to be set. Allowed characters: + * lowercase letters, digits, hyphens (Vertex AI Experiment ID rules). + * See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview. */ @SerialVersionUID(0L) final case class VertexAiResourceConfig( @@ -54,6 +63,7 @@ final case class VertexAiResourceConfig( bootDiskSizeGb: _root_.scala.Int = 0, reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = _root_.scala.None, tensorboardResourceName: _root_.scala.Predef.String = "", + tensorboardExperimentName: _root_.scala.Predef.String = "", unknownFields: _root_.scalapb.UnknownFieldSet = _root_.scalapb.UnknownFieldSet.empty ) extends scalapb.GeneratedMessage with scalapb.lenses.Updatable[VertexAiResourceConfig] { @transient @@ -127,6 +137,13 @@ final case class VertexAiResourceConfig( __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(10, __value) } }; + + { + val __value = tensorboardExperimentName + if (!__value.isEmpty) { + __size += _root_.com.google.protobuf.CodedOutputStream.computeStringSize(11, __value) + } + }; __size += unknownFields.serializedSize __size } @@ -200,6 +217,12 @@ final case class VertexAiResourceConfig( _output__.writeString(10, __v) } }; + { + val __v = tensorboardExperimentName + if (!__v.isEmpty) { + _output__.writeString(11, __v) + } + }; unknownFields.writeTo(_output__) } def withMachineType(__v: _root_.scala.Predef.String): VertexAiResourceConfig = copy(machineType = __v) @@ -214,6 +237,7 @@ final case class VertexAiResourceConfig( def clearReservationAffinity: VertexAiResourceConfig = 
copy(reservationAffinity = _root_.scala.None) def withReservationAffinity(__v: snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity): VertexAiResourceConfig = copy(reservationAffinity = Option(__v)) def withTensorboardResourceName(__v: _root_.scala.Predef.String): VertexAiResourceConfig = copy(tensorboardResourceName = __v) + def withTensorboardExperimentName(__v: _root_.scala.Predef.String): VertexAiResourceConfig = copy(tensorboardExperimentName = __v) def withUnknownFields(__v: _root_.scalapb.UnknownFieldSet) = copy(unknownFields = __v) def discardUnknownFields = copy(unknownFields = _root_.scalapb.UnknownFieldSet.empty) def getFieldByNumber(__fieldNumber: _root_.scala.Int): _root_.scala.Any = { @@ -255,6 +279,10 @@ final case class VertexAiResourceConfig( val __t = tensorboardResourceName if (__t != "") __t else null } + case 11 => { + val __t = tensorboardExperimentName + if (__t != "") __t else null + } } } def getField(__field: _root_.scalapb.descriptors.FieldDescriptor): _root_.scalapb.descriptors.PValue = { @@ -270,6 +298,7 @@ final case class VertexAiResourceConfig( case 8 => _root_.scalapb.descriptors.PInt(bootDiskSizeGb) case 9 => reservationAffinity.map(_.toPMessage).getOrElse(_root_.scalapb.descriptors.PEmpty) case 10 => _root_.scalapb.descriptors.PString(tensorboardResourceName) + case 11 => _root_.scalapb.descriptors.PString(tensorboardExperimentName) } } def toProtoString: _root_.scala.Predef.String = _root_.scalapb.TextFormat.printToUnicodeString(this) @@ -290,6 +319,7 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat var __bootDiskSizeGb: _root_.scala.Int = 0 var __reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = _root_.scala.None var __tensorboardResourceName: _root_.scala.Predef.String = "" + var __tensorboardExperimentName: _root_.scala.Predef.String = "" var `_unknownFields__`: _root_.scalapb.UnknownFieldSet.Builder = null 
var _done__ = false while (!_done__) { @@ -316,6 +346,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat __reservationAffinity = Option(__reservationAffinity.fold(_root_.scalapb.LiteParser.readMessage[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity](_input__))(_root_.scalapb.LiteParser.readMessage(_input__, _))) case 82 => __tensorboardResourceName = _input__.readStringRequireUtf8() + case 90 => + __tensorboardExperimentName = _input__.readStringRequireUtf8() case tag => if (_unknownFields__ == null) { _unknownFields__ = new _root_.scalapb.UnknownFieldSet.Builder() @@ -334,6 +366,7 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat bootDiskSizeGb = __bootDiskSizeGb, reservationAffinity = __reservationAffinity, tensorboardResourceName = __tensorboardResourceName, + tensorboardExperimentName = __tensorboardExperimentName, unknownFields = if (_unknownFields__ == null) _root_.scalapb.UnknownFieldSet.empty else _unknownFields__.result() ) } @@ -350,7 +383,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat schedulingStrategy = __fieldsMap.get(scalaDescriptor.findFieldByNumber(7).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), bootDiskSizeGb = __fieldsMap.get(scalaDescriptor.findFieldByNumber(8).get).map(_.as[_root_.scala.Int]).getOrElse(0), reservationAffinity = __fieldsMap.get(scalaDescriptor.findFieldByNumber(9).get).flatMap(_.as[_root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity]]), - tensorboardResourceName = __fieldsMap.get(scalaDescriptor.findFieldByNumber(10).get).map(_.as[_root_.scala.Predef.String]).getOrElse("") + tensorboardResourceName = __fieldsMap.get(scalaDescriptor.findFieldByNumber(10).get).map(_.as[_root_.scala.Predef.String]).getOrElse(""), + tensorboardExperimentName = 
__fieldsMap.get(scalaDescriptor.findFieldByNumber(11).get).map(_.as[_root_.scala.Predef.String]).getOrElse("") ) case _ => throw new RuntimeException("Expected PMessage") } @@ -375,7 +409,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat schedulingStrategy = "", bootDiskSizeGb = 0, reservationAffinity = _root_.scala.None, - tensorboardResourceName = "" + tensorboardResourceName = "", + tensorboardExperimentName = "" ) implicit class VertexAiResourceConfigLens[UpperPB](_l: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig]) extends _root_.scalapb.lenses.ObjectLens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig](_l) { def machineType: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.machineType)((c_, f_) => c_.copy(machineType = f_)) @@ -389,6 +424,7 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat def reservationAffinity: _root_.scalapb.lenses.Lens[UpperPB, snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity] = field(_.getReservationAffinity)((c_, f_) => c_.copy(reservationAffinity = Option(f_))) def optionalReservationAffinity: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity]] = field(_.reservationAffinity)((c_, f_) => c_.copy(reservationAffinity = f_)) def tensorboardResourceName: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.tensorboardResourceName)((c_, f_) => c_.copy(tensorboardResourceName = f_)) + def tensorboardExperimentName: _root_.scalapb.lenses.Lens[UpperPB, _root_.scala.Predef.String] = field(_.tensorboardExperimentName)((c_, f_) => c_.copy(tensorboardExperimentName = f_)) } final val MACHINE_TYPE_FIELD_NUMBER = 1 final val GPU_TYPE_FIELD_NUMBER = 2 @@ -400,6 +436,7 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat 
final val BOOT_DISK_SIZE_GB_FIELD_NUMBER = 8 final val RESERVATION_AFFINITY_FIELD_NUMBER = 9 final val TENSORBOARD_RESOURCE_NAME_FIELD_NUMBER = 10 + final val TENSORBOARD_EXPERIMENT_NAME_FIELD_NUMBER = 11 def of( machineType: _root_.scala.Predef.String, gpuType: _root_.scala.Predef.String, @@ -410,7 +447,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat schedulingStrategy: _root_.scala.Predef.String, bootDiskSizeGb: _root_.scala.Int, reservationAffinity: _root_.scala.Option[snapchat.research.gbml.gigl_resource_config.VertexAiReservationAffinity], - tensorboardResourceName: _root_.scala.Predef.String + tensorboardResourceName: _root_.scala.Predef.String, + tensorboardExperimentName: _root_.scala.Predef.String ): _root_.snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig = _root_.snapchat.research.gbml.gigl_resource_config.VertexAiResourceConfig( machineType, gpuType, @@ -421,7 +459,8 @@ object VertexAiResourceConfig extends scalapb.GeneratedMessageCompanion[snapchat schedulingStrategy, bootDiskSizeGb, reservationAffinity, - tensorboardResourceName + tensorboardResourceName, + tensorboardExperimentName ) // @@protoc_insertion_point(GeneratedMessageCompanion[snapchat.research.gbml.VertexAiResourceConfig]) } diff --git a/snapchat/research/gbml/gbml_config_pb2.py b/snapchat/research/gbml/gbml_config_pb2.py index bcce21dfb..8e5ac8019 100644 --- a/snapchat/research/gbml/gbml_config_pb2.py +++ b/snapchat/research/gbml/gbml_config_pb2.py @@ -21,7 +21,7 @@ from snapchat.research.gbml import subgraph_sampling_strategy_pb2 as snapchat_dot_research_dot_gbml_dot_subgraph__sampling__strategy__pb2 -DESCRIPTOR = 
_descriptor_pool.Default().AddSerializedFile(b'\n(snapchat/research/gbml/gbml_config.proto\x12\x16snapchat.research.gbml\x1a)snapchat/research/gbml/graph_schema.proto\x1a\x35snapchat/research/gbml/flattened_graph_metadata.proto\x1a-snapchat/research/gbml/dataset_metadata.proto\x1a\x33snapchat/research/gbml/trained_model_metadata.proto\x1a/snapchat/research/gbml/inference_metadata.proto\x1a\x33snapchat/research/gbml/postprocessed_metadata.proto\x1a\x37snapchat/research/gbml/subgraph_sampling_strategy.proto\"\xb8/\n\nGbmlConfig\x12\x46\n\rtask_metadata\x18\x01 \x01(\x0b\x32/.snapchat.research.gbml.GbmlConfig.TaskMetadata\x12=\n\x0egraph_metadata\x18\x02 \x01(\x0b\x32%.snapchat.research.gbml.GraphMetadata\x12\x46\n\rshared_config\x18\x03 \x01(\x0b\x32/.snapchat.research.gbml.GbmlConfig.SharedConfig\x12H\n\x0e\x64\x61taset_config\x18\x04 \x01(\x0b\x32\x30.snapchat.research.gbml.GbmlConfig.DatasetConfig\x12H\n\x0etrainer_config\x18\x05 \x01(\x0b\x32\x30.snapchat.research.gbml.GbmlConfig.TrainerConfig\x12N\n\x11inferencer_config\x18\x06 \x01(\x0b\x32\x33.snapchat.research.gbml.GbmlConfig.InferencerConfig\x12U\n\x15post_processor_config\x18\t \x01(\x0b\x32\x36.snapchat.research.gbml.GbmlConfig.PostProcessorConfig\x12H\n\x0emetrics_config\x18\x07 \x01(\x0b\x32\x30.snapchat.research.gbml.GbmlConfig.MetricsConfig\x12J\n\x0fprofiler_config\x18\x08 \x01(\x0b\x32\x31.snapchat.research.gbml.GbmlConfig.ProfilerConfig\x12K\n\rfeature_flags\x18\n \x03(\x0b\x32\x34.snapchat.research.gbml.GbmlConfig.FeatureFlagsEntry\x1a\x8f\x05\n\x0cTaskMetadata\x12i\n\x18node_based_task_metadata\x18\x01 \x01(\x0b\x32\x45.snapchat.research.gbml.GbmlConfig.TaskMetadata.NodeBasedTaskMetadataH\x00\x12\x94\x01\n/node_anchor_based_link_prediction_task_metadata\x18\x02 \x01(\x0b\x32Y.snapchat.research.gbml.GbmlConfig.TaskMetadata.NodeAnchorBasedLinkPredictionTaskMetadataH\x00\x12i\n\x18link_based_task_metadata\x18\x03 
\x01(\x0b\x32\x45.snapchat.research.gbml.GbmlConfig.TaskMetadata.LinkBasedTaskMetadataH\x00\x1a\x37\n\x15NodeBasedTaskMetadata\x12\x1e\n\x16supervision_node_types\x18\x01 \x03(\t\x1am\n)NodeAnchorBasedLinkPredictionTaskMetadata\x12@\n\x16supervision_edge_types\x18\x01 \x03(\x0b\x32 .snapchat.research.gbml.EdgeType\x1aY\n\x15LinkBasedTaskMetadata\x12@\n\x16supervision_edge_types\x18\x01 \x03(\x0b\x32 .snapchat.research.gbml.EdgeTypeB\x0f\n\rtask_metadata\x1a\x96\x06\n\x0cSharedConfig\x12!\n\x19preprocessed_metadata_uri\x18\x01 \x01(\t\x12P\n\x18\x66lattened_graph_metadata\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.FlattenedGraphMetadata\x12\x41\n\x10\x64\x61taset_metadata\x18\x03 \x01(\x0b\x32\'.snapchat.research.gbml.DatasetMetadata\x12L\n\x16trained_model_metadata\x18\x04 \x01(\x0b\x32,.snapchat.research.gbml.TrainedModelMetadata\x12\x45\n\x12inference_metadata\x18\x05 \x01(\x0b\x32).snapchat.research.gbml.InferenceMetadata\x12M\n\x16postprocessed_metadata\x18\x0c \x01(\x0b\x32-.snapchat.research.gbml.PostProcessedMetadata\x12T\n\x0bshared_args\x18\x06 \x03(\x0b\x32?.snapchat.research.gbml.GbmlConfig.SharedConfig.SharedArgsEntry\x12\x19\n\x11is_graph_directed\x18\x07 \x01(\x08\x12\x1c\n\x14should_skip_training\x18\x08 \x01(\x08\x12\x30\n(should_skip_automatic_temp_asset_cleanup\x18\t \x01(\x08\x12\x1d\n\x15should_skip_inference\x18\n \x01(\x08\x12$\n\x1cshould_skip_model_evaluation\x18\x0b \x01(\x08\x12\x31\n)should_include_isolated_nodes_in_training\x18\r \x01(\x08\x1a\x31\n\x0fSharedArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xd3\x0c\n\rDatasetConfig\x12i\n\x18\x64\x61ta_preprocessor_config\x18\x01 \x01(\x0b\x32G.snapchat.research.gbml.GbmlConfig.DatasetConfig.DataPreprocessorConfig\x12g\n\x17subgraph_sampler_config\x18\x02 \x01(\x0b\x32\x46.snapchat.research.gbml.GbmlConfig.DatasetConfig.SubgraphSamplerConfig\x12\x65\n\x16split_generator_config\x18\x03 
\x01(\x0b\x32\x45.snapchat.research.gbml.GbmlConfig.DatasetConfig.SplitGeneratorConfig\x1a\x84\x02\n\x16\x44\x61taPreprocessorConfig\x12)\n!data_preprocessor_config_cls_path\x18\x01 \x01(\t\x12\x81\x01\n\x16\x64\x61ta_preprocessor_args\x18\x02 \x03(\x0b\x32\x61.snapchat.research.gbml.GbmlConfig.DatasetConfig.DataPreprocessorConfig.DataPreprocessorArgsEntry\x1a;\n\x19\x44\x61taPreprocessorArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xd0\x04\n\x15SubgraphSamplerConfig\x12\x14\n\x08num_hops\x18\x01 \x01(\rB\x02\x18\x01\x12#\n\x17num_neighbors_to_sample\x18\x02 \x01(\x05\x42\x02\x18\x01\x12T\n\x1asubgraph_sampling_strategy\x18\n \x01(\x0b\x32\x30.snapchat.research.gbml.SubgraphSamplingStrategy\x12\x1c\n\x14num_positive_samples\x18\x03 \x01(\r\x12y\n\x12\x65xperimental_flags\x18\x05 \x03(\x0b\x32].snapchat.research.gbml.GbmlConfig.DatasetConfig.SubgraphSamplerConfig.ExperimentalFlagsEntry\x12*\n\"num_max_training_samples_to_output\x18\x06 \x01(\r\x12-\n!num_user_defined_positive_samples\x18\x07 \x01(\rB\x02\x18\x01\x12-\n!num_user_defined_negative_samples\x18\x08 \x01(\rB\x02\x18\x01\x12I\n\x0fgraph_db_config\x18\t \x01(\x0b\x32\x30.snapchat.research.gbml.GbmlConfig.GraphDBConfig\x1a\x38\n\x16\x45xperimentalFlagsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xac\x03\n\x14SplitGeneratorConfig\x12\x1f\n\x17split_strategy_cls_path\x18\x01 \x01(\t\x12y\n\x13split_strategy_args\x18\x02 \x03(\x0b\x32\\.snapchat.research.gbml.GbmlConfig.DatasetConfig.SplitGeneratorConfig.SplitStrategyArgsEntry\x12\x19\n\x11\x61ssigner_cls_path\x18\x03 \x01(\t\x12n\n\rassigner_args\x18\x04 \x03(\x0b\x32W.snapchat.research.gbml.GbmlConfig.DatasetConfig.SplitGeneratorConfig.AssignerArgsEntry\x1a\x38\n\x16SplitStrategyArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x33\n\x11\x41ssignerArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 
\x01(\t:\x02\x38\x01\x1a\x90\x04\n\rGraphDBConfig\x12#\n\x1bgraph_db_ingestion_cls_path\x18\x01 \x01(\t\x12k\n\x17graph_db_ingestion_args\x18\x02 \x03(\x0b\x32J.snapchat.research.gbml.GbmlConfig.GraphDBConfig.GraphDbIngestionArgsEntry\x12X\n\rgraph_db_args\x18\x03 \x03(\x0b\x32\x41.snapchat.research.gbml.GbmlConfig.GraphDBConfig.GraphDbArgsEntry\x12\x66\n\x17graph_db_sampler_config\x18\x04 \x01(\x0b\x32\x45.snapchat.research.gbml.GbmlConfig.GraphDBConfig.GraphDBServiceConfig\x1a;\n\x19GraphDbIngestionArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x32\n\x10GraphDbArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a:\n\x14GraphDBServiceConfig\x12\"\n\x1agraph_db_client_class_path\x18\x01 \x01(\t\x1a\xc1\x01\n\x17GraphStoreStorageConfig\x12\x0f\n\x07\x63ommand\x18\x01 \x01(\t\x12\x61\n\x0cstorage_args\x18\x02 \x03(\x0b\x32K.snapchat.research.gbml.GbmlConfig.GraphStoreStorageConfig.StorageArgsEntry\x1a\x32\n\x10StorageArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xa7\x03\n\rTrainerConfig\x12\x18\n\x10trainer_cls_path\x18\x01 \x01(\t\x12W\n\x0ctrainer_args\x18\x02 \x03(\x0b\x32\x41.snapchat.research.gbml.GbmlConfig.TrainerConfig.TrainerArgsEntry\x12\x12\n\x08\x63ls_path\x18\x64 \x01(\tH\x00\x12\x11\n\x07\x63ommand\x18\x65 \x01(\tH\x00\x12!\n\x19should_log_to_tensorboard\x18\x0c \x01(\x08\x12#\n\x1btensorboard_experiment_name\x18\x0e \x01(\t\x12`\n\x1agraph_store_storage_config\x18\r \x01(\x0b\x32:.snapchat.research.gbml.GbmlConfig.GraphStoreStorageConfigH\x01\x1a\x32\n\x10TrainerArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x42\x0c\n\nexecutableB\x10\n\x0estorage_config\x1a\x8f\x03\n\x10InferencerConfig\x12`\n\x0finferencer_args\x18\x01 \x03(\x0b\x32G.snapchat.research.gbml.GbmlConfig.InferencerConfig.InferencerArgsEntry\x12\x1b\n\x13inferencer_cls_path\x18\x02 
\x01(\t\x12\x12\n\x08\x63ls_path\x18\x64 \x01(\tH\x00\x12\x11\n\x07\x63ommand\x18\x65 \x01(\tH\x00\x12\x1c\n\x14inference_batch_size\x18\x05 \x01(\r\x12`\n\x1agraph_store_storage_config\x18\x06 \x01(\x0b\x32:.snapchat.research.gbml.GbmlConfig.GraphStoreStorageConfigH\x01\x1a\x35\n\x13InferencerArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x42\x0c\n\nexecutableB\x10\n\x0estorage_config\x1a\xdc\x01\n\x13PostProcessorConfig\x12j\n\x13post_processor_args\x18\x01 \x03(\x0b\x32M.snapchat.research.gbml.GbmlConfig.PostProcessorConfig.PostProcessorArgsEntry\x12\x1f\n\x17post_processor_cls_path\x18\x02 \x01(\t\x1a\x38\n\x16PostProcessorArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xb6\x01\n\rMetricsConfig\x12\x18\n\x10metrics_cls_path\x18\x01 \x01(\t\x12W\n\x0cmetrics_args\x18\x02 \x03(\x0b\x32\x41.snapchat.research.gbml.GbmlConfig.MetricsConfig.MetricsArgsEntry\x1a\x32\n\x10MetricsArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xdb\x01\n\x0eProfilerConfig\x12\x1e\n\x16should_enable_profiler\x18\x01 \x01(\x08\x12\x18\n\x10profiler_log_dir\x18\x02 \x01(\t\x12Z\n\rprofiler_args\x18\x03 \x03(\x0b\x32\x43.snapchat.research.gbml.GbmlConfig.ProfilerConfig.ProfilerArgsEntry\x1a\x33\n\x11ProfilerArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x33\n\x11\x46\x65\x61tureFlagsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x62\x06proto3') +DESCRIPTOR = 
_descriptor_pool.Default().AddSerializedFile(b'\n(snapchat/research/gbml/gbml_config.proto\x12\x16snapchat.research.gbml\x1a)snapchat/research/gbml/graph_schema.proto\x1a\x35snapchat/research/gbml/flattened_graph_metadata.proto\x1a-snapchat/research/gbml/dataset_metadata.proto\x1a\x33snapchat/research/gbml/trained_model_metadata.proto\x1a/snapchat/research/gbml/inference_metadata.proto\x1a\x33snapchat/research/gbml/postprocessed_metadata.proto\x1a\x37snapchat/research/gbml/subgraph_sampling_strategy.proto\"\x93/\n\nGbmlConfig\x12\x46\n\rtask_metadata\x18\x01 \x01(\x0b\x32/.snapchat.research.gbml.GbmlConfig.TaskMetadata\x12=\n\x0egraph_metadata\x18\x02 \x01(\x0b\x32%.snapchat.research.gbml.GraphMetadata\x12\x46\n\rshared_config\x18\x03 \x01(\x0b\x32/.snapchat.research.gbml.GbmlConfig.SharedConfig\x12H\n\x0e\x64\x61taset_config\x18\x04 \x01(\x0b\x32\x30.snapchat.research.gbml.GbmlConfig.DatasetConfig\x12H\n\x0etrainer_config\x18\x05 \x01(\x0b\x32\x30.snapchat.research.gbml.GbmlConfig.TrainerConfig\x12N\n\x11inferencer_config\x18\x06 \x01(\x0b\x32\x33.snapchat.research.gbml.GbmlConfig.InferencerConfig\x12U\n\x15post_processor_config\x18\t \x01(\x0b\x32\x36.snapchat.research.gbml.GbmlConfig.PostProcessorConfig\x12H\n\x0emetrics_config\x18\x07 \x01(\x0b\x32\x30.snapchat.research.gbml.GbmlConfig.MetricsConfig\x12J\n\x0fprofiler_config\x18\x08 \x01(\x0b\x32\x31.snapchat.research.gbml.GbmlConfig.ProfilerConfig\x12K\n\rfeature_flags\x18\n \x03(\x0b\x32\x34.snapchat.research.gbml.GbmlConfig.FeatureFlagsEntry\x1a\x8f\x05\n\x0cTaskMetadata\x12i\n\x18node_based_task_metadata\x18\x01 \x01(\x0b\x32\x45.snapchat.research.gbml.GbmlConfig.TaskMetadata.NodeBasedTaskMetadataH\x00\x12\x94\x01\n/node_anchor_based_link_prediction_task_metadata\x18\x02 \x01(\x0b\x32Y.snapchat.research.gbml.GbmlConfig.TaskMetadata.NodeAnchorBasedLinkPredictionTaskMetadataH\x00\x12i\n\x18link_based_task_metadata\x18\x03 
\x01(\x0b\x32\x45.snapchat.research.gbml.GbmlConfig.TaskMetadata.LinkBasedTaskMetadataH\x00\x1a\x37\n\x15NodeBasedTaskMetadata\x12\x1e\n\x16supervision_node_types\x18\x01 \x03(\t\x1am\n)NodeAnchorBasedLinkPredictionTaskMetadata\x12@\n\x16supervision_edge_types\x18\x01 \x03(\x0b\x32 .snapchat.research.gbml.EdgeType\x1aY\n\x15LinkBasedTaskMetadata\x12@\n\x16supervision_edge_types\x18\x01 \x03(\x0b\x32 .snapchat.research.gbml.EdgeTypeB\x0f\n\rtask_metadata\x1a\x96\x06\n\x0cSharedConfig\x12!\n\x19preprocessed_metadata_uri\x18\x01 \x01(\t\x12P\n\x18\x66lattened_graph_metadata\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.FlattenedGraphMetadata\x12\x41\n\x10\x64\x61taset_metadata\x18\x03 \x01(\x0b\x32\'.snapchat.research.gbml.DatasetMetadata\x12L\n\x16trained_model_metadata\x18\x04 \x01(\x0b\x32,.snapchat.research.gbml.TrainedModelMetadata\x12\x45\n\x12inference_metadata\x18\x05 \x01(\x0b\x32).snapchat.research.gbml.InferenceMetadata\x12M\n\x16postprocessed_metadata\x18\x0c \x01(\x0b\x32-.snapchat.research.gbml.PostProcessedMetadata\x12T\n\x0bshared_args\x18\x06 \x03(\x0b\x32?.snapchat.research.gbml.GbmlConfig.SharedConfig.SharedArgsEntry\x12\x19\n\x11is_graph_directed\x18\x07 \x01(\x08\x12\x1c\n\x14should_skip_training\x18\x08 \x01(\x08\x12\x30\n(should_skip_automatic_temp_asset_cleanup\x18\t \x01(\x08\x12\x1d\n\x15should_skip_inference\x18\n \x01(\x08\x12$\n\x1cshould_skip_model_evaluation\x18\x0b \x01(\x08\x12\x31\n)should_include_isolated_nodes_in_training\x18\r \x01(\x08\x1a\x31\n\x0fSharedArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xd3\x0c\n\rDatasetConfig\x12i\n\x18\x64\x61ta_preprocessor_config\x18\x01 \x01(\x0b\x32G.snapchat.research.gbml.GbmlConfig.DatasetConfig.DataPreprocessorConfig\x12g\n\x17subgraph_sampler_config\x18\x02 \x01(\x0b\x32\x46.snapchat.research.gbml.GbmlConfig.DatasetConfig.SubgraphSamplerConfig\x12\x65\n\x16split_generator_config\x18\x03 
\x01(\x0b\x32\x45.snapchat.research.gbml.GbmlConfig.DatasetConfig.SplitGeneratorConfig\x1a\x84\x02\n\x16\x44\x61taPreprocessorConfig\x12)\n!data_preprocessor_config_cls_path\x18\x01 \x01(\t\x12\x81\x01\n\x16\x64\x61ta_preprocessor_args\x18\x02 \x03(\x0b\x32\x61.snapchat.research.gbml.GbmlConfig.DatasetConfig.DataPreprocessorConfig.DataPreprocessorArgsEntry\x1a;\n\x19\x44\x61taPreprocessorArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xd0\x04\n\x15SubgraphSamplerConfig\x12\x14\n\x08num_hops\x18\x01 \x01(\rB\x02\x18\x01\x12#\n\x17num_neighbors_to_sample\x18\x02 \x01(\x05\x42\x02\x18\x01\x12T\n\x1asubgraph_sampling_strategy\x18\n \x01(\x0b\x32\x30.snapchat.research.gbml.SubgraphSamplingStrategy\x12\x1c\n\x14num_positive_samples\x18\x03 \x01(\r\x12y\n\x12\x65xperimental_flags\x18\x05 \x03(\x0b\x32].snapchat.research.gbml.GbmlConfig.DatasetConfig.SubgraphSamplerConfig.ExperimentalFlagsEntry\x12*\n\"num_max_training_samples_to_output\x18\x06 \x01(\r\x12-\n!num_user_defined_positive_samples\x18\x07 \x01(\rB\x02\x18\x01\x12-\n!num_user_defined_negative_samples\x18\x08 \x01(\rB\x02\x18\x01\x12I\n\x0fgraph_db_config\x18\t \x01(\x0b\x32\x30.snapchat.research.gbml.GbmlConfig.GraphDBConfig\x1a\x38\n\x16\x45xperimentalFlagsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xac\x03\n\x14SplitGeneratorConfig\x12\x1f\n\x17split_strategy_cls_path\x18\x01 \x01(\t\x12y\n\x13split_strategy_args\x18\x02 \x03(\x0b\x32\\.snapchat.research.gbml.GbmlConfig.DatasetConfig.SplitGeneratorConfig.SplitStrategyArgsEntry\x12\x19\n\x11\x61ssigner_cls_path\x18\x03 \x01(\t\x12n\n\rassigner_args\x18\x04 \x03(\x0b\x32W.snapchat.research.gbml.GbmlConfig.DatasetConfig.SplitGeneratorConfig.AssignerArgsEntry\x1a\x38\n\x16SplitStrategyArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x33\n\x11\x41ssignerArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 
\x01(\t:\x02\x38\x01\x1a\x90\x04\n\rGraphDBConfig\x12#\n\x1bgraph_db_ingestion_cls_path\x18\x01 \x01(\t\x12k\n\x17graph_db_ingestion_args\x18\x02 \x03(\x0b\x32J.snapchat.research.gbml.GbmlConfig.GraphDBConfig.GraphDbIngestionArgsEntry\x12X\n\rgraph_db_args\x18\x03 \x03(\x0b\x32\x41.snapchat.research.gbml.GbmlConfig.GraphDBConfig.GraphDbArgsEntry\x12\x66\n\x17graph_db_sampler_config\x18\x04 \x01(\x0b\x32\x45.snapchat.research.gbml.GbmlConfig.GraphDBConfig.GraphDBServiceConfig\x1a;\n\x19GraphDbIngestionArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x32\n\x10GraphDbArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a:\n\x14GraphDBServiceConfig\x12\"\n\x1agraph_db_client_class_path\x18\x01 \x01(\t\x1a\xc1\x01\n\x17GraphStoreStorageConfig\x12\x0f\n\x07\x63ommand\x18\x01 \x01(\t\x12\x61\n\x0cstorage_args\x18\x02 \x03(\x0b\x32K.snapchat.research.gbml.GbmlConfig.GraphStoreStorageConfig.StorageArgsEntry\x1a\x32\n\x10StorageArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x82\x03\n\rTrainerConfig\x12\x18\n\x10trainer_cls_path\x18\x01 \x01(\t\x12W\n\x0ctrainer_args\x18\x02 \x03(\x0b\x32\x41.snapchat.research.gbml.GbmlConfig.TrainerConfig.TrainerArgsEntry\x12\x12\n\x08\x63ls_path\x18\x64 \x01(\tH\x00\x12\x11\n\x07\x63ommand\x18\x65 \x01(\tH\x00\x12!\n\x19should_log_to_tensorboard\x18\x0c \x01(\x08\x12`\n\x1agraph_store_storage_config\x18\r \x01(\x0b\x32:.snapchat.research.gbml.GbmlConfig.GraphStoreStorageConfigH\x01\x1a\x32\n\x10TrainerArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x42\x0c\n\nexecutableB\x10\n\x0estorage_config\x1a\x8f\x03\n\x10InferencerConfig\x12`\n\x0finferencer_args\x18\x01 \x03(\x0b\x32G.snapchat.research.gbml.GbmlConfig.InferencerConfig.InferencerArgsEntry\x12\x1b\n\x13inferencer_cls_path\x18\x02 \x01(\t\x12\x12\n\x08\x63ls_path\x18\x64 \x01(\tH\x00\x12\x11\n\x07\x63ommand\x18\x65 
\x01(\tH\x00\x12\x1c\n\x14inference_batch_size\x18\x05 \x01(\r\x12`\n\x1agraph_store_storage_config\x18\x06 \x01(\x0b\x32:.snapchat.research.gbml.GbmlConfig.GraphStoreStorageConfigH\x01\x1a\x35\n\x13InferencerArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x42\x0c\n\nexecutableB\x10\n\x0estorage_config\x1a\xdc\x01\n\x13PostProcessorConfig\x12j\n\x13post_processor_args\x18\x01 \x03(\x0b\x32M.snapchat.research.gbml.GbmlConfig.PostProcessorConfig.PostProcessorArgsEntry\x12\x1f\n\x17post_processor_cls_path\x18\x02 \x01(\t\x1a\x38\n\x16PostProcessorArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xb6\x01\n\rMetricsConfig\x12\x18\n\x10metrics_cls_path\x18\x01 \x01(\t\x12W\n\x0cmetrics_args\x18\x02 \x03(\x0b\x32\x41.snapchat.research.gbml.GbmlConfig.MetricsConfig.MetricsArgsEntry\x1a\x32\n\x10MetricsArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\xdb\x01\n\x0eProfilerConfig\x12\x1e\n\x16should_enable_profiler\x18\x01 \x01(\x08\x12\x18\n\x10profiler_log_dir\x18\x02 \x01(\t\x12Z\n\rprofiler_args\x18\x03 \x03(\x0b\x32\x43.snapchat.research.gbml.GbmlConfig.ProfilerConfig.ProfilerArgsEntry\x1a\x33\n\x11ProfilerArgsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x33\n\x11\x46\x65\x61tureFlagsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x62\x06proto3') @@ -352,7 +352,7 @@ _GBMLCONFIG_FEATUREFLAGSENTRY._options = None _GBMLCONFIG_FEATUREFLAGSENTRY._serialized_options = b'8\001' _GBMLCONFIG._serialized_start=426 - _GBMLCONFIG._serialized_end=6498 + _GBMLCONFIG._serialized_end=6461 _GBMLCONFIG_TASKMETADATA._serialized_start=1190 _GBMLCONFIG_TASKMETADATA._serialized_end=1845 _GBMLCONFIG_TASKMETADATA_NODEBASEDTASKMETADATA._serialized_start=1571 @@ -394,25 +394,25 @@ _GBMLCONFIG_GRAPHSTORESTORAGECONFIG_STORAGEARGSENTRY._serialized_start=4937 
_GBMLCONFIG_GRAPHSTORESTORAGECONFIG_STORAGEARGSENTRY._serialized_end=4987 _GBMLCONFIG_TRAINERCONFIG._serialized_start=4990 - _GBMLCONFIG_TRAINERCONFIG._serialized_end=5413 - _GBMLCONFIG_TRAINERCONFIG_TRAINERARGSENTRY._serialized_start=5331 - _GBMLCONFIG_TRAINERCONFIG_TRAINERARGSENTRY._serialized_end=5381 - _GBMLCONFIG_INFERENCERCONFIG._serialized_start=5416 - _GBMLCONFIG_INFERENCERCONFIG._serialized_end=5815 - _GBMLCONFIG_INFERENCERCONFIG_INFERENCERARGSENTRY._serialized_start=5730 - _GBMLCONFIG_INFERENCERCONFIG_INFERENCERARGSENTRY._serialized_end=5783 - _GBMLCONFIG_POSTPROCESSORCONFIG._serialized_start=5818 - _GBMLCONFIG_POSTPROCESSORCONFIG._serialized_end=6038 - _GBMLCONFIG_POSTPROCESSORCONFIG_POSTPROCESSORARGSENTRY._serialized_start=5982 - _GBMLCONFIG_POSTPROCESSORCONFIG_POSTPROCESSORARGSENTRY._serialized_end=6038 - _GBMLCONFIG_METRICSCONFIG._serialized_start=6041 - _GBMLCONFIG_METRICSCONFIG._serialized_end=6223 - _GBMLCONFIG_METRICSCONFIG_METRICSARGSENTRY._serialized_start=6173 - _GBMLCONFIG_METRICSCONFIG_METRICSARGSENTRY._serialized_end=6223 - _GBMLCONFIG_PROFILERCONFIG._serialized_start=6226 - _GBMLCONFIG_PROFILERCONFIG._serialized_end=6445 - _GBMLCONFIG_PROFILERCONFIG_PROFILERARGSENTRY._serialized_start=6394 - _GBMLCONFIG_PROFILERCONFIG_PROFILERARGSENTRY._serialized_end=6445 - _GBMLCONFIG_FEATUREFLAGSENTRY._serialized_start=6447 - _GBMLCONFIG_FEATUREFLAGSENTRY._serialized_end=6498 + _GBMLCONFIG_TRAINERCONFIG._serialized_end=5376 + _GBMLCONFIG_TRAINERCONFIG_TRAINERARGSENTRY._serialized_start=5294 + _GBMLCONFIG_TRAINERCONFIG_TRAINERARGSENTRY._serialized_end=5344 + _GBMLCONFIG_INFERENCERCONFIG._serialized_start=5379 + _GBMLCONFIG_INFERENCERCONFIG._serialized_end=5778 + _GBMLCONFIG_INFERENCERCONFIG_INFERENCERARGSENTRY._serialized_start=5693 + _GBMLCONFIG_INFERENCERCONFIG_INFERENCERARGSENTRY._serialized_end=5746 + _GBMLCONFIG_POSTPROCESSORCONFIG._serialized_start=5781 + _GBMLCONFIG_POSTPROCESSORCONFIG._serialized_end=6001 + 
_GBMLCONFIG_POSTPROCESSORCONFIG_POSTPROCESSORARGSENTRY._serialized_start=5945 + _GBMLCONFIG_POSTPROCESSORCONFIG_POSTPROCESSORARGSENTRY._serialized_end=6001 + _GBMLCONFIG_METRICSCONFIG._serialized_start=6004 + _GBMLCONFIG_METRICSCONFIG._serialized_end=6186 + _GBMLCONFIG_METRICSCONFIG_METRICSARGSENTRY._serialized_start=6136 + _GBMLCONFIG_METRICSCONFIG_METRICSARGSENTRY._serialized_end=6186 + _GBMLCONFIG_PROFILERCONFIG._serialized_start=6189 + _GBMLCONFIG_PROFILERCONFIG._serialized_end=6408 + _GBMLCONFIG_PROFILERCONFIG_PROFILERARGSENTRY._serialized_start=6357 + _GBMLCONFIG_PROFILERCONFIG_PROFILERARGSENTRY._serialized_end=6408 + _GBMLCONFIG_FEATUREFLAGSENTRY._serialized_start=6410 + _GBMLCONFIG_FEATUREFLAGSENTRY._serialized_end=6461 # @@protoc_insertion_point(module_scope) diff --git a/snapchat/research/gbml/gbml_config_pb2.pyi b/snapchat/research/gbml/gbml_config_pb2.pyi index f60ac11bb..98d4ee693 100644 --- a/snapchat/research/gbml/gbml_config_pb2.pyi +++ b/snapchat/research/gbml/gbml_config_pb2.pyi @@ -542,7 +542,6 @@ class GbmlConfig(google.protobuf.message.Message): CLS_PATH_FIELD_NUMBER: builtins.int COMMAND_FIELD_NUMBER: builtins.int SHOULD_LOG_TO_TENSORBOARD_FIELD_NUMBER: builtins.int - TENSORBOARD_EXPERIMENT_NAME_FIELD_NUMBER: builtins.int GRAPH_STORE_STORAGE_CONFIG_FIELD_NUMBER: builtins.int trainer_cls_path: builtins.str """(deprecated) @@ -557,17 +556,6 @@ class GbmlConfig(google.protobuf.message.Message): """Command to use for launching trainer job""" should_log_to_tensorboard: builtins.bool """Weather to log to tensorboard or not (defaults to false)""" - tensorboard_experiment_name: builtins.str - """Optional. When set, the trainer's chief rank streams events to a - TensorboardExperiment with this name on the configured Tensorboard - resource, in addition to Vertex's built-in per-job auto-upload. - Multiple jobs that share the same value land in the same - TensorboardExperiment, so they appear as comparable runs on one - TensorBoard page. 
Requires - GiglResourceConfig...tensorboard_resource_name to be set. Allowed - characters: lowercase letters, digits, hyphens (Vertex AI Experiment - ID rules). - """ @property def graph_store_storage_config(self) -> global___GbmlConfig.GraphStoreStorageConfig: ... def __init__( @@ -578,11 +566,10 @@ class GbmlConfig(google.protobuf.message.Message): cls_path: builtins.str = ..., command: builtins.str = ..., should_log_to_tensorboard: builtins.bool = ..., - tensorboard_experiment_name: builtins.str = ..., graph_store_storage_config: global___GbmlConfig.GraphStoreStorageConfig | None = ..., ) -> None: ... def HasField(self, field_name: typing_extensions.Literal["cls_path", b"cls_path", "command", b"command", "executable", b"executable", "graph_store_storage_config", b"graph_store_storage_config", "storage_config", b"storage_config"]) -> builtins.bool: ... - def ClearField(self, field_name: typing_extensions.Literal["cls_path", b"cls_path", "command", b"command", "executable", b"executable", "graph_store_storage_config", b"graph_store_storage_config", "should_log_to_tensorboard", b"should_log_to_tensorboard", "storage_config", b"storage_config", "tensorboard_experiment_name", b"tensorboard_experiment_name", "trainer_args", b"trainer_args", "trainer_cls_path", b"trainer_cls_path"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["cls_path", b"cls_path", "command", b"command", "executable", b"executable", "graph_store_storage_config", b"graph_store_storage_config", "should_log_to_tensorboard", b"should_log_to_tensorboard", "storage_config", b"storage_config", "trainer_args", b"trainer_args", "trainer_cls_path", b"trainer_cls_path"]) -> None: ... @typing.overload def WhichOneof(self, oneof_group: typing_extensions.Literal["executable", b"executable"]) -> typing_extensions.Literal["cls_path", "command"] | None: ... 
@typing.overload diff --git a/snapchat/research/gbml/gigl_resource_config_pb2.py b/snapchat/research/gbml/gigl_resource_config_pb2.py index cf55764c4..e701fd3ef 100644 --- a/snapchat/research/gbml/gigl_resource_config_pb2.py +++ b/snapchat/research/gbml/gigl_resource_config_pb2.py @@ -15,7 +15,7 @@ -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n1snapchat/research/gbml/gigl_resource_config.proto\x12\x16snapchat.research.gbml\"Y\n\x13SparkResourceConfig\x12\x14\n\x0cmachine_type\x18\x01 \x01(\t\x12\x16\n\x0enum_local_ssds\x18\x02 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x03 \x01(\r\"\x83\x01\n\x16\x44\x61taflowResourceConfig\x12\x13\n\x0bnum_workers\x18\x01 \x01(\r\x12\x17\n\x0fmax_num_workers\x18\x02 \x01(\r\x12\x14\n\x0cmachine_type\x18\x03 \x01(\t\x12\x14\n\x0c\x64isk_size_gb\x18\x04 \x01(\r\x12\x0f\n\x07timeout\x18\x05 \x01(\r\"\xbc\x01\n\x16\x44\x61taPreprocessorConfig\x12P\n\x18\x65\x64ge_preprocessor_config\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfig\x12P\n\x18node_preprocessor_config\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfig\"h\n\x15VertexAiTrainerConfig\x12\x14\n\x0cmachine_type\x18\x01 \x01(\t\x12\x10\n\x08gpu_type\x18\x02 \x01(\t\x12\x11\n\tgpu_limit\x18\x03 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x04 \x01(\r\"z\n\x10KFPTrainerConfig\x12\x13\n\x0b\x63pu_request\x18\x01 \x01(\t\x12\x16\n\x0ememory_request\x18\x02 \x01(\t\x12\x10\n\x08gpu_type\x18\x03 \x01(\t\x12\x11\n\tgpu_limit\x18\x04 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x05 \x01(\r\")\n\x12LocalTrainerConfig\x12\x13\n\x0bnum_workers\x18\x01 \x01(\r\"O\n\x1bVertexAiReservationAffinity\x12\x0c\n\x04type\x18\x01 \x01(\t\x12\"\n\x1areservation_resource_names\x18\x02 \x03(\t\"\xc5\x02\n\x16VertexAiResourceConfig\x12\x14\n\x0cmachine_type\x18\x01 \x01(\t\x12\x10\n\x08gpu_type\x18\x02 \x01(\t\x12\x11\n\tgpu_limit\x18\x03 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x04 \x01(\r\x12\x0f\n\x07timeout\x18\x05 \x01(\r\x12\x1b\n\x13gcp_region_override\x18\x06 
\x01(\t\x12\x1b\n\x13scheduling_strategy\x18\x07 \x01(\t\x12\x19\n\x11\x62oot_disk_size_gb\x18\x08 \x01(\r\x12Q\n\x14reservation_affinity\x18\t \x01(\x0b\x32\x33.snapchat.research.gbml.VertexAiReservationAffinity\x12!\n\x19tensorboard_resource_name\x18\n \x01(\t\"{\n\x11KFPResourceConfig\x12\x13\n\x0b\x63pu_request\x18\x01 \x01(\t\x12\x16\n\x0ememory_request\x18\x02 \x01(\t\x12\x10\n\x08gpu_type\x18\x03 \x01(\t\x12\x11\n\tgpu_limit\x18\x04 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x05 \x01(\r\"*\n\x13LocalResourceConfig\x12\x13\n\x0bnum_workers\x18\x01 \x01(\r\"\xd4\x01\n\x18VertexAiGraphStoreConfig\x12H\n\x10graph_store_pool\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfig\x12\x44\n\x0c\x63ompute_pool\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfig\x12(\n compute_cluster_local_world_size\x18\x03 \x01(\x05\"\x93\x02\n\x18\x44istributedTrainerConfig\x12Q\n\x18vertex_ai_trainer_config\x18\x01 \x01(\x0b\x32-.snapchat.research.gbml.VertexAiTrainerConfigH\x00\x12\x46\n\x12kfp_trainer_config\x18\x02 \x01(\x0b\x32(.snapchat.research.gbml.KFPTrainerConfigH\x00\x12J\n\x14local_trainer_config\x18\x03 \x01(\x0b\x32*.snapchat.research.gbml.LocalTrainerConfigH\x00\x42\x10\n\x0etrainer_config\"\xf5\x02\n\x15TrainerResourceConfig\x12R\n\x18vertex_ai_trainer_config\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfigH\x00\x12G\n\x12kfp_trainer_config\x18\x02 \x01(\x0b\x32).snapchat.research.gbml.KFPResourceConfigH\x00\x12K\n\x14local_trainer_config\x18\x03 \x01(\x0b\x32+.snapchat.research.gbml.LocalResourceConfigH\x00\x12`\n$vertex_ai_graph_store_trainer_config\x18\x04 \x01(\x0b\x32\x30.snapchat.research.gbml.VertexAiGraphStoreConfigH\x00\x42\x10\n\x0etrainer_config\"\x91\x03\n\x18InferencerResourceConfig\x12U\n\x1bvertex_ai_inferencer_config\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfigH\x00\x12T\n\x1a\x64\x61taflow_inferencer_config\x18\x02 
\x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfigH\x00\x12N\n\x17local_inferencer_config\x18\x03 \x01(\x0b\x32+.snapchat.research.gbml.LocalResourceConfigH\x00\x12\x63\n\'vertex_ai_graph_store_inferencer_config\x18\x04 \x01(\x0b\x32\x30.snapchat.research.gbml.VertexAiGraphStoreConfigH\x00\x42\x13\n\x11inferencer_config\"\xa3\x04\n\x14SharedResourceConfig\x12Y\n\x0fresource_labels\x18\x01 \x03(\x0b\x32@.snapchat.research.gbml.SharedResourceConfig.ResourceLabelsEntry\x12_\n\x15\x63ommon_compute_config\x18\x02 \x01(\x0b\x32@.snapchat.research.gbml.SharedResourceConfig.CommonComputeConfig\x1a\x97\x02\n\x13\x43ommonComputeConfig\x12\x0f\n\x07project\x18\x01 \x01(\t\x12\x0e\n\x06region\x18\x02 \x01(\t\x12\x1a\n\x12temp_assets_bucket\x18\x03 \x01(\t\x12#\n\x1btemp_regional_assets_bucket\x18\x04 \x01(\t\x12\x1a\n\x12perm_assets_bucket\x18\x05 \x01(\t\x12#\n\x1btemp_assets_bq_dataset_name\x18\x06 \x01(\t\x12!\n\x19\x65mbedding_bq_dataset_name\x18\x07 \x01(\t\x12!\n\x19gcp_service_account_email\x18\x08 \x01(\t\x12\x17\n\x0f\x64\x61taflow_runner\x18\x0b \x01(\t\x1a\x35\n\x13ResourceLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\xc8\x05\n\x12GiglResourceConfig\x12$\n\x1ashared_resource_config_uri\x18\x01 \x01(\tH\x00\x12N\n\x16shared_resource_config\x18\x02 \x01(\x0b\x32,.snapchat.research.gbml.SharedResourceConfigH\x00\x12K\n\x13preprocessor_config\x18\x0c \x01(\x0b\x32..snapchat.research.gbml.DataPreprocessorConfig\x12L\n\x17subgraph_sampler_config\x18\r \x01(\x0b\x32+.snapchat.research.gbml.SparkResourceConfig\x12K\n\x16split_generator_config\x18\x0e \x01(\x0b\x32+.snapchat.research.gbml.SparkResourceConfig\x12L\n\x0etrainer_config\x18\x0f \x01(\x0b\x32\x30.snapchat.research.gbml.DistributedTrainerConfigB\x02\x18\x01\x12M\n\x11inferencer_config\x18\x10 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfigB\x02\x18\x01\x12N\n\x17trainer_resource_config\x18\x11 
\x01(\x0b\x32-.snapchat.research.gbml.TrainerResourceConfig\x12T\n\x1ainferencer_resource_config\x18\x12 \x01(\x0b\x32\x30.snapchat.research.gbml.InferencerResourceConfigB\x11\n\x0fshared_resource*\xf3\x01\n\tComponent\x12\x15\n\x11\x43omponent_Unknown\x10\x00\x12\x1e\n\x1a\x43omponent_Config_Validator\x10\x01\x12\x1e\n\x1a\x43omponent_Config_Populator\x10\x02\x12\x1f\n\x1b\x43omponent_Data_Preprocessor\x10\x03\x12\x1e\n\x1a\x43omponent_Subgraph_Sampler\x10\x04\x12\x1d\n\x19\x43omponent_Split_Generator\x10\x05\x12\x15\n\x11\x43omponent_Trainer\x10\x06\x12\x18\n\x14\x43omponent_Inferencer\x10\x07\x62\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n1snapchat/research/gbml/gigl_resource_config.proto\x12\x16snapchat.research.gbml\"Y\n\x13SparkResourceConfig\x12\x14\n\x0cmachine_type\x18\x01 \x01(\t\x12\x16\n\x0enum_local_ssds\x18\x02 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x03 \x01(\r\"\x83\x01\n\x16\x44\x61taflowResourceConfig\x12\x13\n\x0bnum_workers\x18\x01 \x01(\r\x12\x17\n\x0fmax_num_workers\x18\x02 \x01(\r\x12\x14\n\x0cmachine_type\x18\x03 \x01(\t\x12\x14\n\x0c\x64isk_size_gb\x18\x04 \x01(\r\x12\x0f\n\x07timeout\x18\x05 \x01(\r\"\xbc\x01\n\x16\x44\x61taPreprocessorConfig\x12P\n\x18\x65\x64ge_preprocessor_config\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfig\x12P\n\x18node_preprocessor_config\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfig\"h\n\x15VertexAiTrainerConfig\x12\x14\n\x0cmachine_type\x18\x01 \x01(\t\x12\x10\n\x08gpu_type\x18\x02 \x01(\t\x12\x11\n\tgpu_limit\x18\x03 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x04 \x01(\r\"z\n\x10KFPTrainerConfig\x12\x13\n\x0b\x63pu_request\x18\x01 \x01(\t\x12\x16\n\x0ememory_request\x18\x02 \x01(\t\x12\x10\n\x08gpu_type\x18\x03 \x01(\t\x12\x11\n\tgpu_limit\x18\x04 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x05 \x01(\r\")\n\x12LocalTrainerConfig\x12\x13\n\x0bnum_workers\x18\x01 \x01(\r\"O\n\x1bVertexAiReservationAffinity\x12\x0c\n\x04type\x18\x01 
\x01(\t\x12\"\n\x1areservation_resource_names\x18\x02 \x03(\t\"\xea\x02\n\x16VertexAiResourceConfig\x12\x14\n\x0cmachine_type\x18\x01 \x01(\t\x12\x10\n\x08gpu_type\x18\x02 \x01(\t\x12\x11\n\tgpu_limit\x18\x03 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x04 \x01(\r\x12\x0f\n\x07timeout\x18\x05 \x01(\r\x12\x1b\n\x13gcp_region_override\x18\x06 \x01(\t\x12\x1b\n\x13scheduling_strategy\x18\x07 \x01(\t\x12\x19\n\x11\x62oot_disk_size_gb\x18\x08 \x01(\r\x12Q\n\x14reservation_affinity\x18\t \x01(\x0b\x32\x33.snapchat.research.gbml.VertexAiReservationAffinity\x12!\n\x19tensorboard_resource_name\x18\n \x01(\t\x12#\n\x1btensorboard_experiment_name\x18\x0b \x01(\t\"{\n\x11KFPResourceConfig\x12\x13\n\x0b\x63pu_request\x18\x01 \x01(\t\x12\x16\n\x0ememory_request\x18\x02 \x01(\t\x12\x10\n\x08gpu_type\x18\x03 \x01(\t\x12\x11\n\tgpu_limit\x18\x04 \x01(\r\x12\x14\n\x0cnum_replicas\x18\x05 \x01(\r\"*\n\x13LocalResourceConfig\x12\x13\n\x0bnum_workers\x18\x01 \x01(\r\"\xd4\x01\n\x18VertexAiGraphStoreConfig\x12H\n\x10graph_store_pool\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfig\x12\x44\n\x0c\x63ompute_pool\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfig\x12(\n compute_cluster_local_world_size\x18\x03 \x01(\x05\"\x93\x02\n\x18\x44istributedTrainerConfig\x12Q\n\x18vertex_ai_trainer_config\x18\x01 \x01(\x0b\x32-.snapchat.research.gbml.VertexAiTrainerConfigH\x00\x12\x46\n\x12kfp_trainer_config\x18\x02 \x01(\x0b\x32(.snapchat.research.gbml.KFPTrainerConfigH\x00\x12J\n\x14local_trainer_config\x18\x03 \x01(\x0b\x32*.snapchat.research.gbml.LocalTrainerConfigH\x00\x42\x10\n\x0etrainer_config\"\xf5\x02\n\x15TrainerResourceConfig\x12R\n\x18vertex_ai_trainer_config\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfigH\x00\x12G\n\x12kfp_trainer_config\x18\x02 \x01(\x0b\x32).snapchat.research.gbml.KFPResourceConfigH\x00\x12K\n\x14local_trainer_config\x18\x03 
\x01(\x0b\x32+.snapchat.research.gbml.LocalResourceConfigH\x00\x12`\n$vertex_ai_graph_store_trainer_config\x18\x04 \x01(\x0b\x32\x30.snapchat.research.gbml.VertexAiGraphStoreConfigH\x00\x42\x10\n\x0etrainer_config\"\x91\x03\n\x18InferencerResourceConfig\x12U\n\x1bvertex_ai_inferencer_config\x18\x01 \x01(\x0b\x32..snapchat.research.gbml.VertexAiResourceConfigH\x00\x12T\n\x1a\x64\x61taflow_inferencer_config\x18\x02 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfigH\x00\x12N\n\x17local_inferencer_config\x18\x03 \x01(\x0b\x32+.snapchat.research.gbml.LocalResourceConfigH\x00\x12\x63\n\'vertex_ai_graph_store_inferencer_config\x18\x04 \x01(\x0b\x32\x30.snapchat.research.gbml.VertexAiGraphStoreConfigH\x00\x42\x13\n\x11inferencer_config\"\xa3\x04\n\x14SharedResourceConfig\x12Y\n\x0fresource_labels\x18\x01 \x03(\x0b\x32@.snapchat.research.gbml.SharedResourceConfig.ResourceLabelsEntry\x12_\n\x15\x63ommon_compute_config\x18\x02 \x01(\x0b\x32@.snapchat.research.gbml.SharedResourceConfig.CommonComputeConfig\x1a\x97\x02\n\x13\x43ommonComputeConfig\x12\x0f\n\x07project\x18\x01 \x01(\t\x12\x0e\n\x06region\x18\x02 \x01(\t\x12\x1a\n\x12temp_assets_bucket\x18\x03 \x01(\t\x12#\n\x1btemp_regional_assets_bucket\x18\x04 \x01(\t\x12\x1a\n\x12perm_assets_bucket\x18\x05 \x01(\t\x12#\n\x1btemp_assets_bq_dataset_name\x18\x06 \x01(\t\x12!\n\x19\x65mbedding_bq_dataset_name\x18\x07 \x01(\t\x12!\n\x19gcp_service_account_email\x18\x08 \x01(\t\x12\x17\n\x0f\x64\x61taflow_runner\x18\x0b \x01(\t\x1a\x35\n\x13ResourceLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\xc8\x05\n\x12GiglResourceConfig\x12$\n\x1ashared_resource_config_uri\x18\x01 \x01(\tH\x00\x12N\n\x16shared_resource_config\x18\x02 \x01(\x0b\x32,.snapchat.research.gbml.SharedResourceConfigH\x00\x12K\n\x13preprocessor_config\x18\x0c \x01(\x0b\x32..snapchat.research.gbml.DataPreprocessorConfig\x12L\n\x17subgraph_sampler_config\x18\r 
\x01(\x0b\x32+.snapchat.research.gbml.SparkResourceConfig\x12K\n\x16split_generator_config\x18\x0e \x01(\x0b\x32+.snapchat.research.gbml.SparkResourceConfig\x12L\n\x0etrainer_config\x18\x0f \x01(\x0b\x32\x30.snapchat.research.gbml.DistributedTrainerConfigB\x02\x18\x01\x12M\n\x11inferencer_config\x18\x10 \x01(\x0b\x32..snapchat.research.gbml.DataflowResourceConfigB\x02\x18\x01\x12N\n\x17trainer_resource_config\x18\x11 \x01(\x0b\x32-.snapchat.research.gbml.TrainerResourceConfig\x12T\n\x1ainferencer_resource_config\x18\x12 \x01(\x0b\x32\x30.snapchat.research.gbml.InferencerResourceConfigB\x11\n\x0fshared_resource*\xf3\x01\n\tComponent\x12\x15\n\x11\x43omponent_Unknown\x10\x00\x12\x1e\n\x1a\x43omponent_Config_Validator\x10\x01\x12\x1e\n\x1a\x43omponent_Config_Populator\x10\x02\x12\x1f\n\x1b\x43omponent_Data_Preprocessor\x10\x03\x12\x1e\n\x1a\x43omponent_Subgraph_Sampler\x10\x04\x12\x1d\n\x19\x43omponent_Split_Generator\x10\x05\x12\x15\n\x11\x43omponent_Trainer\x10\x06\x12\x18\n\x14\x43omponent_Inferencer\x10\x07\x62\x06proto3') _COMPONENT = DESCRIPTOR.enum_types_by_name['Component'] Component = enum_type_wrapper.EnumTypeWrapper(_COMPONENT) @@ -184,8 +184,8 @@ _GIGLRESOURCECONFIG.fields_by_name['trainer_config']._serialized_options = b'\030\001' _GIGLRESOURCECONFIG.fields_by_name['inferencer_config']._options = None _GIGLRESOURCECONFIG.fields_by_name['inferencer_config']._serialized_options = b'\030\001' - _COMPONENT._serialized_start=3883 - _COMPONENT._serialized_end=4126 + _COMPONENT._serialized_start=3920 + _COMPONENT._serialized_end=4163 _SPARKRESOURCECONFIG._serialized_start=77 _SPARKRESOURCECONFIG._serialized_end=166 _DATAFLOWRESOURCECONFIG._serialized_start=169 @@ -201,25 +201,25 @@ _VERTEXAIRESERVATIONAFFINITY._serialized_start=766 _VERTEXAIRESERVATIONAFFINITY._serialized_end=845 _VERTEXAIRESOURCECONFIG._serialized_start=848 - _VERTEXAIRESOURCECONFIG._serialized_end=1173 - _KFPRESOURCECONFIG._serialized_start=1175 - _KFPRESOURCECONFIG._serialized_end=1298 - 
_LOCALRESOURCECONFIG._serialized_start=1300 - _LOCALRESOURCECONFIG._serialized_end=1342 - _VERTEXAIGRAPHSTORECONFIG._serialized_start=1345 - _VERTEXAIGRAPHSTORECONFIG._serialized_end=1557 - _DISTRIBUTEDTRAINERCONFIG._serialized_start=1560 - _DISTRIBUTEDTRAINERCONFIG._serialized_end=1835 - _TRAINERRESOURCECONFIG._serialized_start=1838 - _TRAINERRESOURCECONFIG._serialized_end=2211 - _INFERENCERRESOURCECONFIG._serialized_start=2214 - _INFERENCERRESOURCECONFIG._serialized_end=2615 - _SHAREDRESOURCECONFIG._serialized_start=2618 - _SHAREDRESOURCECONFIG._serialized_end=3165 - _SHAREDRESOURCECONFIG_COMMONCOMPUTECONFIG._serialized_start=2831 - _SHAREDRESOURCECONFIG_COMMONCOMPUTECONFIG._serialized_end=3110 - _SHAREDRESOURCECONFIG_RESOURCELABELSENTRY._serialized_start=3112 - _SHAREDRESOURCECONFIG_RESOURCELABELSENTRY._serialized_end=3165 - _GIGLRESOURCECONFIG._serialized_start=3168 - _GIGLRESOURCECONFIG._serialized_end=3880 + _VERTEXAIRESOURCECONFIG._serialized_end=1210 + _KFPRESOURCECONFIG._serialized_start=1212 + _KFPRESOURCECONFIG._serialized_end=1335 + _LOCALRESOURCECONFIG._serialized_start=1337 + _LOCALRESOURCECONFIG._serialized_end=1379 + _VERTEXAIGRAPHSTORECONFIG._serialized_start=1382 + _VERTEXAIGRAPHSTORECONFIG._serialized_end=1594 + _DISTRIBUTEDTRAINERCONFIG._serialized_start=1597 + _DISTRIBUTEDTRAINERCONFIG._serialized_end=1872 + _TRAINERRESOURCECONFIG._serialized_start=1875 + _TRAINERRESOURCECONFIG._serialized_end=2248 + _INFERENCERRESOURCECONFIG._serialized_start=2251 + _INFERENCERRESOURCECONFIG._serialized_end=2652 + _SHAREDRESOURCECONFIG._serialized_start=2655 + _SHAREDRESOURCECONFIG._serialized_end=3202 + _SHAREDRESOURCECONFIG_COMMONCOMPUTECONFIG._serialized_start=2868 + _SHAREDRESOURCECONFIG_COMMONCOMPUTECONFIG._serialized_end=3147 + _SHAREDRESOURCECONFIG_RESOURCELABELSENTRY._serialized_start=3149 + _SHAREDRESOURCECONFIG_RESOURCELABELSENTRY._serialized_end=3202 + _GIGLRESOURCECONFIG._serialized_start=3205 + _GIGLRESOURCECONFIG._serialized_end=3917 # 
@@protoc_insertion_point(module_scope) diff --git a/snapchat/research/gbml/gigl_resource_config_pb2.pyi b/snapchat/research/gbml/gigl_resource_config_pb2.pyi index 250c69973..09ddb04c0 100644 --- a/snapchat/research/gbml/gigl_resource_config_pb2.pyi +++ b/snapchat/research/gbml/gigl_resource_config_pb2.pyi @@ -260,6 +260,7 @@ class VertexAiResourceConfig(google.protobuf.message.Message): BOOT_DISK_SIZE_GB_FIELD_NUMBER: builtins.int RESERVATION_AFFINITY_FIELD_NUMBER: builtins.int TENSORBOARD_RESOURCE_NAME_FIELD_NUMBER: builtins.int + TENSORBOARD_EXPERIMENT_NAME_FIELD_NUMBER: builtins.int machine_type: builtins.str """Machine type for job""" gpu_type: builtins.str @@ -301,6 +302,16 @@ class VertexAiResourceConfig(google.protobuf.message.Message): See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview for the Tensorboard data model. """ + tensorboard_experiment_name: builtins.str + """Optional. When set, the trainer's chief rank streams events to a + TensorboardExperiment with this name on the TB resource above, in + addition to Vertex's per-job auto-upload. Multiple jobs that share this + value land in the same TensorboardExperiment, so they appear as + comparable runs on one TensorBoard page. Requires + tensorboard_resource_name above to be set. Allowed characters: + lowercase letters, digits, hyphens (Vertex AI Experiment ID rules). + See https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview. + """ def __init__( self, *, @@ -314,9 +325,10 @@ class VertexAiResourceConfig(google.protobuf.message.Message): boot_disk_size_gb: builtins.int = ..., reservation_affinity: global___VertexAiReservationAffinity | None = ..., tensorboard_resource_name: builtins.str = ..., + tensorboard_experiment_name: builtins.str = ..., ) -> None: ... def HasField(self, field_name: typing_extensions.Literal["reservation_affinity", b"reservation_affinity"]) -> builtins.bool: ... 
- def ClearField(self, field_name: typing_extensions.Literal["boot_disk_size_gb", b"boot_disk_size_gb", "gcp_region_override", b"gcp_region_override", "gpu_limit", b"gpu_limit", "gpu_type", b"gpu_type", "machine_type", b"machine_type", "num_replicas", b"num_replicas", "reservation_affinity", b"reservation_affinity", "scheduling_strategy", b"scheduling_strategy", "tensorboard_resource_name", b"tensorboard_resource_name", "timeout", b"timeout"]) -> None: ... + def ClearField(self, field_name: typing_extensions.Literal["boot_disk_size_gb", b"boot_disk_size_gb", "gcp_region_override", b"gcp_region_override", "gpu_limit", b"gpu_limit", "gpu_type", b"gpu_type", "machine_type", b"machine_type", "num_replicas", b"num_replicas", "reservation_affinity", b"reservation_affinity", "scheduling_strategy", b"scheduling_strategy", "tensorboard_experiment_name", b"tensorboard_experiment_name", "tensorboard_resource_name", b"tensorboard_resource_name", "timeout", b"timeout"]) -> None: ... global___VertexAiResourceConfig = VertexAiResourceConfig diff --git a/tests/unit/src/common/vertex_ai_launcher_test.py b/tests/unit/src/common/vertex_ai_launcher_test.py index 5409951e6..6a54f9027 100644 --- a/tests/unit/src/common/vertex_ai_launcher_test.py +++ b/tests/unit/src/common/vertex_ai_launcher_test.py @@ -337,20 +337,10 @@ def test_launch_inference_single_pool_cpu(self, mock_vertex_ai_service_class): self.assertEqual(job_config.labels, expected_labels) @patch("gigl.src.common.vertex_ai_launcher.VertexAIService") - def test_launch_single_pool_job_threads_experiment_name( + def test_launch_single_pool_job_reads_experiment_name_from_resource_config( self, mock_vertex_ai_service_class ): - """Test that tensorboard_experiment_name is forwarded to the VertexAiJobConfig - when passed to launch_single_pool_job.""" - job_name = "test-single-pool-tb-exp" - task_config_uri = Uri("gs://bucket/task_config.yaml") - resource_config_uri = Uri("gs://bucket/resource_config.yaml") - process_command = 
"python -m gigl.src.training.v2.glt_trainer" - process_runtime_args: dict[str, str] = {} - cpu_docker_uri = "gcr.io/project/cpu-image:tag" - cuda_docker_uri = "gcr.io/project/cuda-image:tag" - component = GiGLComponents.Trainer - vertex_ai_region = "us-central1" + """tensorboard_experiment_name on the resource config flows to the VertexAiJobConfig.""" experiment_name = "my-single-pool-experiment" gigl_resource_config_proto = ( @@ -362,24 +352,24 @@ def test_launch_single_pool_job_threads_experiment_name( resource_config=gigl_resource_config_proto ) vertex_ai_config = gigl_resource_config_proto.inferencer_resource_config.vertex_ai_inferencer_config + vertex_ai_config.tensorboard_experiment_name = experiment_name mock_service_instance = Mock() mock_vertex_ai_service_class.return_value = mock_service_instance launch_single_pool_job( vertex_ai_resource_config=vertex_ai_config, - job_name=job_name, - task_config_uri=task_config_uri, - resource_config_uri=resource_config_uri, - process_command=process_command, - process_runtime_args=process_runtime_args, + job_name="test-single-pool-tb-exp", + task_config_uri=Uri("gs://bucket/task_config.yaml"), + resource_config_uri=Uri("gs://bucket/resource_config.yaml"), + process_command="python -m gigl.src.training.v2.glt_trainer", + process_runtime_args={}, resource_config_wrapper=resource_config_wrapper, - cpu_docker_uri=cpu_docker_uri, - cuda_docker_uri=cuda_docker_uri, - component=component, - vertex_ai_region=vertex_ai_region, + cpu_docker_uri="gcr.io/project/cpu-image:tag", + cuda_docker_uri="gcr.io/project/cuda-image:tag", + component=GiGLComponents.Trainer, + vertex_ai_region="us-central1", tensorboard_logs_uri=Uri("gs://bucket/job/trainer/logs/"), - tensorboard_experiment_name=experiment_name, ) mock_service_instance.launch_job.assert_called_once() @@ -388,19 +378,12 @@ def test_launch_single_pool_job_threads_experiment_name( self.assertEqual(job_config.tensorboard_experiment_name, experiment_name) 
@patch("gigl.src.common.vertex_ai_launcher.VertexAIService") - def test_launch_graph_store_job_threads_experiment_name_to_compute_pool_only( + def test_launch_graph_store_job_reads_experiment_name_from_compute_pool( self, mock_vertex_ai_service_class ): - """Test that tensorboard_experiment_name is forwarded to the compute pool's - VertexAiJobConfig but NOT to the storage pool's VertexAiJobConfig.""" - job_name = "test-graph-store-tb-exp" - task_config_uri = Uri("gs://bucket/task_config.yaml") - resource_config_uri = Uri("gs://bucket/resource_config.yaml") - process_command = "python -m gigl.src.training.v2.glt_trainer" - process_runtime_args: dict[str, str] = {} - cpu_docker_uri = "gcr.io/project/cpu-image:tag" - cuda_docker_uri = "gcr.io/project/cuda-image:tag" - component = GiGLComponents.Trainer + """compute_pool.tensorboard_experiment_name flows to the compute pool's + VertexAiJobConfig; storage pool stays empty. + """ experiment_name = "my-graph-store-experiment" gigl_resource_config_proto = _create_gigl_resource_config_with_graph_store( @@ -410,25 +393,25 @@ def test_launch_graph_store_job_threads_experiment_name_to_compute_pool_only( resource_config=gigl_resource_config_proto ) graph_store_config = gigl_resource_config_proto.trainer_resource_config.vertex_ai_graph_store_trainer_config + graph_store_config.compute_pool.tensorboard_experiment_name = experiment_name mock_service_instance = Mock() mock_vertex_ai_service_class.return_value = mock_service_instance launch_graph_store_enabled_job( vertex_ai_graph_store_config=graph_store_config, - job_name=job_name, - task_config_uri=task_config_uri, - resource_config_uri=resource_config_uri, - compute_commmand=process_command, - compute_runtime_args=process_runtime_args, + job_name="test-graph-store-tb-exp", + task_config_uri=Uri("gs://bucket/task_config.yaml"), + resource_config_uri=Uri("gs://bucket/resource_config.yaml"), + compute_commmand="python -m gigl.src.training.v2.glt_trainer", + compute_runtime_args={}, 
resource_config_wrapper=resource_config_wrapper, storage_command="python -m gigl.distributed.graph_store.storage_main", storage_args={}, - cpu_docker_uri=cpu_docker_uri, - cuda_docker_uri=cuda_docker_uri, - component=component, + cpu_docker_uri="gcr.io/project/cpu-image:tag", + cuda_docker_uri="gcr.io/project/cuda-image:tag", + component=GiGLComponents.Trainer, tensorboard_logs_uri=Uri("gs://bucket/job/trainer/logs/"), - tensorboard_experiment_name=experiment_name, ) mock_service_instance.launch_graph_store_job.assert_called_once() @@ -436,21 +419,20 @@ def test_launch_graph_store_job_threads_experiment_name_to_compute_pool_only( compute_job_config = call_args.kwargs["compute_pool_job_config"] storage_job_config = call_args.kwargs["storage_pool_job_config"] - # Compute pool SHOULD have the experiment name self.assertEqual( compute_job_config.tensorboard_experiment_name, experiment_name ) - # Storage pool MUST NOT have the experiment name self.assertIsNone(storage_job_config.tensorboard_experiment_name) def test_build_job_config_threads_experiment_name(self) -> None: - """Test that tensorboard_experiment_name is forwarded to VertexAiJobConfig.""" + """tensorboard_experiment_name on the resource config flows to VertexAiJobConfig.""" resource_config = gigl_resource_config_pb2.VertexAiResourceConfig( machine_type="n1-standard-4", gpu_type="ACCELERATOR_TYPE_UNSPECIFIED", gpu_limit=0, num_replicas=1, tensorboard_resource_name="projects/p/locations/us/tensorboards/1", + tensorboard_experiment_name="my-comparison", ) cfg = _build_job_config( job_name="job", @@ -463,7 +445,6 @@ def test_build_job_config_threads_experiment_name(self) -> None: vertex_ai_resource_config=resource_config, env_vars=[], tensorboard_logs_uri=Uri("gs://b/run/logs/"), - tensorboard_experiment_name="my-comparison", ) self.assertEqual(cfg.tensorboard_experiment_name, "my-comparison") @@ -499,6 +480,7 @@ def test_build_job_config_injects_gigl_tensorboard_env_vars(self) -> None: gpu_limit=0, 
num_replicas=1, tensorboard_resource_name="projects/p/locations/us/tensorboards/1", + tensorboard_experiment_name="my-comparison", ) cfg = _build_job_config( job_name="gigl_train_some_task", @@ -511,7 +493,6 @@ def test_build_job_config_injects_gigl_tensorboard_env_vars(self) -> None: vertex_ai_resource_config=resource_config, env_vars=[], tensorboard_logs_uri=Uri("gs://b/run/logs/"), - tensorboard_experiment_name="my-comparison", ) env = {ev.name: ev.value for ev in cfg.environment_variables or []} self.assertEqual( @@ -532,6 +513,7 @@ def test_build_job_config_run_name_is_unique_per_call(self) -> None: gpu_limit=0, num_replicas=1, tensorboard_resource_name="projects/p/locations/us/tensorboards/1", + tensorboard_experiment_name="my-comparison", ) kwargs = dict( job_name="gigl_train_same_name", @@ -544,7 +526,6 @@ def test_build_job_config_run_name_is_unique_per_call(self) -> None: vertex_ai_resource_config=resource_config, env_vars=[], tensorboard_logs_uri=Uri("gs://b/run/logs/"), - tensorboard_experiment_name="my-comparison", ) first = _build_job_config(**kwargs) # type: ignore[arg-type] # Sleep one second so the timestamp suffix changes deterministically. diff --git a/tests/unit/src/training/glt_trainer_test.py b/tests/unit/src/training/glt_trainer_test.py index f77f2847e..1a6d246ff 100644 --- a/tests/unit/src/training/glt_trainer_test.py +++ b/tests/unit/src/training/glt_trainer_test.py @@ -1,4 +1,10 @@ -"""Unit tests for GLTTrainer — verifies tensorboard_experiment_name forwarding.""" +"""Unit tests for GLTTrainer dispatch. + +The trainer no longer extracts ``tensorboard_experiment_name`` from +``GbmlConfig``; that field now lives on ``VertexAiResourceConfig`` and the +launcher reads it directly. These tests confirm the trainer dispatches to +the right launcher based on ``trainer_config`` type. 
+""" from unittest.mock import MagicMock, patch @@ -42,103 +48,64 @@ def _make_resource_config_wrapper_with_graph_store() -> MagicMock: return mock_wrapper -def _make_gbml_config_pb_wrapper(experiment_name: str = "my-comparison") -> MagicMock: - """Return a GbmlConfigPbWrapper mock with tensorboard_experiment_name set.""" +def _make_gbml_config_pb_wrapper() -> MagicMock: + """Return a minimal GbmlConfigPbWrapper mock for trainer dispatch.""" trainer_config_proto = gbml_config_pb2.GbmlConfig.TrainerConfig( command="python -m gigl.src.training.v2.glt_trainer", - tensorboard_experiment_name=experiment_name, ) - mock_wrapper = MagicMock() mock_wrapper.trainer_config = trainer_config_proto - # Ensure tensorboard_logs_uri is empty so UriFactory is not called. mock_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri = "" return mock_wrapper -class TestGltTrainerExperimentNameForwarding(TestCase): - """Tests that GLTTrainer forwards tensorboard_experiment_name to the launcher.""" +class TestGltTrainerDispatch(TestCase): + """Tests that GLTTrainer dispatches to the correct launcher entry point.""" @patch("gigl.src.training.v2.glt_trainer.launch_single_pool_job") @patch("gigl.src.training.v2.glt_trainer.GbmlConfigPbWrapper") @patch("gigl.src.training.v2.glt_trainer.get_resource_config") - def test_single_pool_forwards_experiment_name( + def test_single_pool_resource_config_dispatches_to_single_pool_launcher( self, - mock_get_resource_config, - mock_gbml_config_cls, - mock_launch_single_pool_job, + mock_get_resource_config: MagicMock, + mock_gbml_config_cls: MagicMock, + mock_launch_single_pool_job: MagicMock, ) -> None: - """launch_single_pool_job receives tensorboard_experiment_name='my-comparison'.""" mock_get_resource_config.return_value = ( _make_resource_config_wrapper_with_single_pool() ) mock_gbml_config_cls.get_gbml_config_pb_wrapper_from_uri.return_value = ( - _make_gbml_config_pb_wrapper("my-comparison") + _make_gbml_config_pb_wrapper() ) - trainer = 
GLTTrainer() - trainer.run( + GLTTrainer().run( applied_task_identifier=AppliedTaskIdentifier("test-job"), task_config_uri=UriFactory.create_uri("gs://bucket/task.yaml"), resource_config_uri=UriFactory.create_uri("gs://bucket/resource.yaml"), ) mock_launch_single_pool_job.assert_called_once() - call_kwargs = mock_launch_single_pool_job.call_args.kwargs - self.assertEqual(call_kwargs["tensorboard_experiment_name"], "my-comparison") @patch("gigl.src.training.v2.glt_trainer.launch_graph_store_enabled_job") @patch("gigl.src.training.v2.glt_trainer.GbmlConfigPbWrapper") @patch("gigl.src.training.v2.glt_trainer.get_resource_config") - def test_graph_store_forwards_experiment_name( + def test_graph_store_resource_config_dispatches_to_graph_store_launcher( self, - mock_get_resource_config, - mock_gbml_config_cls, - mock_launch_graph_store_enabled_job, + mock_get_resource_config: MagicMock, + mock_gbml_config_cls: MagicMock, + mock_launch_graph_store_enabled_job: MagicMock, ) -> None: - """launch_graph_store_enabled_job receives tensorboard_experiment_name='my-comparison'.""" mock_get_resource_config.return_value = ( _make_resource_config_wrapper_with_graph_store() ) mock_gbml_config_cls.get_gbml_config_pb_wrapper_from_uri.return_value = ( - _make_gbml_config_pb_wrapper("my-comparison") + _make_gbml_config_pb_wrapper() ) - trainer = GLTTrainer() - trainer.run( + GLTTrainer().run( applied_task_identifier=AppliedTaskIdentifier("test-job"), task_config_uri=UriFactory.create_uri("gs://bucket/task.yaml"), resource_config_uri=UriFactory.create_uri("gs://bucket/resource.yaml"), ) mock_launch_graph_store_enabled_job.assert_called_once() - call_kwargs = mock_launch_graph_store_enabled_job.call_args.kwargs - self.assertEqual(call_kwargs["tensorboard_experiment_name"], "my-comparison") - - @patch("gigl.src.training.v2.glt_trainer.launch_single_pool_job") - @patch("gigl.src.training.v2.glt_trainer.GbmlConfigPbWrapper") - @patch("gigl.src.training.v2.glt_trainer.get_resource_config") - 
def test_single_pool_empty_experiment_name_becomes_none( - self, - mock_get_resource_config, - mock_gbml_config_cls, - mock_launch_single_pool_job, - ) -> None: - """Empty string tensorboard_experiment_name is coerced to None.""" - mock_get_resource_config.return_value = ( - _make_resource_config_wrapper_with_single_pool() - ) - mock_gbml_config_cls.get_gbml_config_pb_wrapper_from_uri.return_value = ( - _make_gbml_config_pb_wrapper("") # proto default empty string - ) - - trainer = GLTTrainer() - trainer.run( - applied_task_identifier=AppliedTaskIdentifier("test-job"), - task_config_uri=UriFactory.create_uri("gs://bucket/task.yaml"), - resource_config_uri=UriFactory.create_uri("gs://bucket/resource.yaml"), - ) - - mock_launch_single_pool_job.assert_called_once() - call_kwargs = mock_launch_single_pool_job.call_args.kwargs - self.assertIsNone(call_kwargs["tensorboard_experiment_name"]) diff --git a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py index e0bcc44e8..609c47676 100644 --- a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py +++ b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py @@ -102,13 +102,9 @@ def _create_gbml_config_with_tensorboard_enabled() -> GbmlConfigPbWrapper: return GbmlConfigPbWrapper(gbml_config_pb=gbml_config) -def _create_gbml_config_with_tensorboard_experiment_name( - experiment_name: str = "my-comparison", -) -> GbmlConfigPbWrapper: - """Create a GbmlConfig with trainer tensorboard_experiment_name set.""" - gbml_config = gbml_config_pb2.GbmlConfig() - gbml_config.trainer_config.tensorboard_experiment_name = experiment_name - return GbmlConfigPbWrapper(gbml_config_pb=gbml_config) +def _create_empty_gbml_config() -> GbmlConfigPbWrapper: + """Create a minimal GbmlConfig (no flags set).""" + return 
GbmlConfigPbWrapper(gbml_config_pb=gbml_config_pb2.GbmlConfig()) def _create_resource_config_with_both_graph_stores() -> GiglResourceConfigWrapper: @@ -146,6 +142,7 @@ def _create_resource_config_without_graph_stores() -> GiglResourceConfigWrapper: def _create_resource_config_with_trainer_tensorboard( *, tensorboard_resource_name: str, + tensorboard_experiment_name: str = "", use_graph_store: bool = False, ) -> GiglResourceConfigWrapper: """Create a GiglResourceConfig with a trainer TensorBoard resource.""" @@ -157,12 +154,43 @@ def _create_resource_config_with_trainer_tensorboard( graph_store_config.compute_pool.tensorboard_resource_name = ( tensorboard_resource_name ) + graph_store_config.compute_pool.tensorboard_experiment_name = ( + tensorboard_experiment_name + ) config.trainer_resource_config.vertex_ai_graph_store_trainer_config.CopyFrom( graph_store_config ) else: vertex_ai_resource_config = _create_vertex_ai_resource_config() vertex_ai_resource_config.tensorboard_resource_name = tensorboard_resource_name + vertex_ai_resource_config.tensorboard_experiment_name = ( + tensorboard_experiment_name + ) + config.trainer_resource_config.vertex_ai_trainer_config.CopyFrom( + vertex_ai_resource_config + ) + + return GiglResourceConfigWrapper(resource_config=config) + + +def _create_resource_config_with_experiment_name_only( + *, + experiment_name: str, + use_graph_store: bool = False, +) -> GiglResourceConfigWrapper: + """Create a GiglResourceConfig with experiment_name set but NO TB resource.""" + config = gigl_resource_config_pb2.GiglResourceConfig() + _create_shared_resource_config(config) + + if use_graph_store: + graph_store_config = _create_vertex_ai_graph_store_config() + graph_store_config.compute_pool.tensorboard_experiment_name = experiment_name + config.trainer_resource_config.vertex_ai_graph_store_trainer_config.CopyFrom( + graph_store_config + ) + else: + vertex_ai_resource_config = _create_vertex_ai_resource_config() + 
vertex_ai_resource_config.tensorboard_experiment_name = experiment_name config.trainer_resource_config.vertex_ai_trainer_config.CopyFrom( vertex_ai_resource_config ) @@ -300,10 +328,10 @@ def test_resource_has_inferencer_graph_store_template_does_not(self): def test_experiment_name_set_without_tensorboard_resource_raises(self): """tensorboard_experiment_name set but no TB resource → AssertionError mentioning the field.""" - gbml_config = _create_gbml_config_with_tensorboard_experiment_name( + gbml_config = _create_empty_gbml_config() + resource_config = _create_resource_config_with_experiment_name_only( experiment_name="my-comparison" ) - resource_config = _create_resource_config_without_graph_stores() with self.assertRaises(AssertionError) as ctx: check_vertex_ai_trainer_tensorboard_compatibility( @@ -314,13 +342,12 @@ def test_experiment_name_set_without_tensorboard_resource_raises(self): def test_experiment_name_set_with_tensorboard_resource_does_not_raise(self): """tensorboard_experiment_name set and TB resource present → no exception.""" - gbml_config = _create_gbml_config_with_tensorboard_experiment_name( - experiment_name="my-comparison" - ) + gbml_config = _create_empty_gbml_config() resource_config = _create_resource_config_with_trainer_tensorboard( tensorboard_resource_name=( "projects/test-project/locations/us-central1/tensorboards/test" - ) + ), + tensorboard_experiment_name="my-comparison", ) check_vertex_ai_trainer_tensorboard_compatibility( @@ -332,13 +359,12 @@ def test_experiment_name_set_with_graph_store_tensorboard_resource_does_not_rais self, ): """tensorboard_experiment_name set and graph-store TB resource present → no exception.""" - gbml_config = _create_gbml_config_with_tensorboard_experiment_name( - experiment_name="my-comparison" - ) + gbml_config = _create_empty_gbml_config() resource_config = _create_resource_config_with_trainer_tensorboard( tensorboard_resource_name=( "projects/test-project/locations/us-central1/tensorboards/test" ), + 
tensorboard_experiment_name="my-comparison", use_graph_store=True, ) From 236275efd5ae3f84832809be99bfe2029491e399 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 23:33:24 +0000 Subject: [PATCH 49/59] examples: update stale CORA task-config comment to point at the new proto After moving tensorboard_experiment_name to VertexAiResourceConfig the comment was still pointing users at the old TrainerConfig location and gbml_config.proto. --- .../configs/e2e_hom_cora_sup_task_config.yaml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml b/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml index e8716ae85..7cdf22b03 100644 --- a/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml +++ b/examples/link_prediction/configs/e2e_hom_cora_sup_task_config.yaml @@ -18,12 +18,10 @@ trainerConfig: log_every_n_batch: "50" # Frequency in which we log batch information num_neighbors: "[10, 10]" # Fanout per hop, specified as a string representation of a list for the homogeneous use case command: python -m examples.link_prediction.homogeneous_training - # To enable cross-job TensorBoard comparison, override - # ``trainerConfig.tensorboardExperimentName`` in your own task config and - # configure ``GiglResourceConfig.trainerResourceConfig...tensorboardResourceName``. - # Left unset here so the default e2e CORA test stays compatible with - # resource configs that don't include a TensorBoard instance. See - # ``proto/snapchat/research/gbml/gbml_config.proto`` for details. + # To enable cross-job TensorBoard comparison, set + # ``GiglResourceConfig.trainerResourceConfig...tensorboardExperimentName`` + # alongside the ``tensorboardResourceName`` on the same resource config. + # See ``proto/snapchat/research/gbml/gigl_resource_config.proto`` for details. 
inferencerConfig: inferencerArgs: # Example argument to inferencer From 32a5d24322ca1fc759ddc99d1638295274e2520f Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 23:55:04 +0000 Subject: [PATCH 50/59] =?UTF-8?q?launcher:=20revert=20use=5Fcuda=20+=20sto?= =?UTF-8?q?rage=20container=5Furi=20fixes=20=E2=80=94=20separate=20PR?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These fixes were originally bundled with the TB enablement commit but they're independent of the TB feature. Reverting them here keeps this PR scoped to TB only; the bug-fix lands in a follow-up PR: - compute pool: use_cuda=is_cpu_execution (re-introduces the inverted flag, matches main) - graph-store storage pool: tracks compute pool's container_uri / use_cuda instead of always running CPU - rename is_compute_cpu_execution back to is_cpu_execution The corresponding test assertions (--use_cuda presence, storage container_uri) are dropped since they verified the fixed behavior. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- gigl/src/common/vertex_ai_launcher.py | 21 ++++++++----------- .../src/common/vertex_ai_launcher_test.py | 4 ---- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/gigl/src/common/vertex_ai_launcher.py b/gigl/src/common/vertex_ai_launcher.py index 955b6b4ed..944a41a6d 100644 --- a/gigl/src/common/vertex_ai_launcher.py +++ b/gigl/src/common/vertex_ai_launcher.py @@ -130,7 +130,7 @@ def launch_single_pool_job( resource_config_uri=resource_config_uri, command_str=process_command, args=process_runtime_args, - use_cuda=not is_cpu_execution, + use_cuda=is_cpu_execution, container_uri=container_uri, vertex_ai_resource_config=vertex_ai_resource_config, env_vars=[env_var.EnvVar(name="TF_CPP_MIN_LOG_LEVEL", value="3")], @@ -193,16 +193,13 @@ def launch_graph_store_enabled_job( storage_pool_config = vertex_ai_graph_store_config.graph_store_pool compute_pool_config = vertex_ai_graph_store_config.compute_pool - # Compute workers may use GPUs, but storage workers always run the CPU - # graph-store entrypoint. 
- is_compute_cpu_execution = _determine_if_cpu_execution( + # Determine if CPU or GPU based on compute pool + is_cpu_execution = _determine_if_cpu_execution( vertex_ai_resource_config=compute_pool_config ) cpu_docker_uri = cpu_docker_uri or DEFAULT_GIGL_RELEASE_SRC_IMAGE_CPU cuda_docker_uri = cuda_docker_uri or DEFAULT_GIGL_RELEASE_SRC_IMAGE_CUDA - compute_container_uri = ( - cpu_docker_uri if is_compute_cpu_execution else cuda_docker_uri - ) + container_uri = cpu_docker_uri if is_cpu_execution else cuda_docker_uri logger.info(f"Running {component.value} with command: {compute_commmand}") @@ -210,7 +207,7 @@ def launch_graph_store_enabled_job( vertex_ai_graph_store_config.compute_cluster_local_world_size ) if not num_compute_processes: - if is_compute_cpu_execution: + if is_cpu_execution: num_compute_processes = 1 else: num_compute_processes = vertex_ai_graph_store_config.compute_pool.gpu_limit @@ -233,8 +230,8 @@ def launch_graph_store_enabled_job( resource_config_uri=resource_config_uri, command_str=compute_commmand, args=compute_runtime_args, - use_cuda=not is_compute_cpu_execution, - container_uri=compute_container_uri, + use_cuda=is_cpu_execution, + container_uri=container_uri, vertex_ai_resource_config=compute_pool_config, env_vars=environment_variables, labels=labels, @@ -248,8 +245,8 @@ def launch_graph_store_enabled_job( resource_config_uri=resource_config_uri, command_str=storage_command, args=storage_args, - use_cuda=False, - container_uri=cpu_docker_uri, + use_cuda=is_cpu_execution, + container_uri=container_uri, vertex_ai_resource_config=storage_pool_config, env_vars=environment_variables, labels=labels, diff --git a/tests/unit/src/common/vertex_ai_launcher_test.py b/tests/unit/src/common/vertex_ai_launcher_test.py index 6a54f9027..0a566b4c3 100644 --- a/tests/unit/src/common/vertex_ai_launcher_test.py +++ b/tests/unit/src/common/vertex_ai_launcher_test.py @@ -197,7 +197,6 @@ def test_launch_training_graph_store_cuda(self, 
mock_vertex_ai_service_class): self.assertIn( f"--epochs={process_runtime_args['epochs']}", compute_job_config.args ) - self.assertIn("--use_cuda", compute_job_config.args) self.assertEqual( compute_job_config.base_output_dir, "gs://test-perm-bucket/job-name/trainer", @@ -209,14 +208,12 @@ def test_launch_training_graph_store_cuda(self, mock_vertex_ai_service_class): # Verify storage pool config self.assertEqual(storage_job_config.machine_type, storage_pool.machine_type) - self.assertEqual(storage_job_config.container_uri, cpu_docker_uri) self.assertIn( "gigl.distributed.graph_store.storage_main", " ".join(storage_job_config.command), ) self.assertIsNotNone(storage_job_config.args) assert storage_job_config.args is not None # Type narrowing for mypy - self.assertNotIn("--use_cuda", storage_job_config.args) self.assertIsNone(storage_job_config.base_output_dir) self.assertIsNone(storage_job_config.tensorboard_resource_name) @@ -324,7 +321,6 @@ def test_launch_inference_single_pool_cpu(self, mock_vertex_ai_service_class): self.assertIn( f"--output_path={process_runtime_args['output_path']}", job_config.args ) - self.assertNotIn("--use_cuda", job_config.args) self.assertIsNone(job_config.base_output_dir) self.assertIsNone(job_config.tensorboard_resource_name) From b17e8d1510760bdebd00c45e9866a77c05f4feb3 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 23:55:12 +0000 Subject: [PATCH 51/59] examples: drop try/finally around training loop, call close() at end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The try/finally wrapper was meant to guarantee ``tensorboard_writer.close()`` runs even if the training body raised. But Vertex AI is the only relevant runtime here, and on a worker that crashes mid-training the process exits anyway — the paired ``aiplatform.end_upload_tb_log()`` is best-effort regardless. Plain sequential ``writer = from_env(...); ... writer.close()`` matches the rest of the codebase. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../graph_store/heterogeneous_training.py | 456 +++++++++-------- .../graph_store/homogeneous_training.py | 438 ++++++++-------- .../link_prediction/heterogeneous_training.py | 472 +++++++++--------- .../link_prediction/homogeneous_training.py | 464 +++++++++-------- 4 files changed, 903 insertions(+), 927 deletions(-) diff --git a/examples/link_prediction/graph_store/heterogeneous_training.py b/examples/link_prediction/graph_store/heterogeneous_training.py index 5c6019973..1c0e956a0 100644 --- a/examples/link_prediction/graph_store/heterogeneous_training.py +++ b/examples/link_prediction/graph_store/heterogeneous_training.py @@ -463,266 +463,262 @@ def _training_process( is_chief_process = rank == 0 tensorboard_writer = TensorBoardWriter.from_env(enabled=is_chief_process) - try: - loss_fn = RetrievalLoss( - loss=torch.nn.CrossEntropyLoss(reduction="mean"), - temperature=0.07, - remove_accidental_hits=True, - ) - batch_idx = 0 + loss_fn = RetrievalLoss( + loss=torch.nn.CrossEntropyLoss(reduction="mean"), + temperature=0.07, + remove_accidental_hits=True, + ) + batch_idx = 0 - if not args.should_skip_training: - train_main_loader, train_random_negative_loader = _setup_dataloaders( - dataset=dataset, - split="train", - cluster_info=args.cluster_info, - supervision_edge_type=args.supervision_edge_type, - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, - ) + if not args.should_skip_training: + train_main_loader, train_random_negative_loader = _setup_dataloaders( + dataset=dataset, + split="train", + cluster_info=args.cluster_info, + supervision_edge_type=args.supervision_edge_type, + num_neighbors=args.num_neighbors, + 
sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) - train_main_loader_iter = InfiniteIterator(train_main_loader) - train_random_negative_loader_iter = InfiniteIterator( - train_random_negative_loader - ) + train_main_loader_iter = InfiniteIterator(train_main_loader) + train_random_negative_loader_iter = InfiniteIterator( + train_random_negative_loader + ) - val_main_loader, val_random_negative_loader = _setup_dataloaders( - dataset=dataset, - split="val", - cluster_info=args.cluster_info, - supervision_edge_type=args.supervision_edge_type, - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, - ) + val_main_loader, val_random_negative_loader = _setup_dataloaders( + dataset=dataset, + split="val", + cluster_info=args.cluster_info, + supervision_edge_type=args.supervision_edge_type, + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) - val_main_loader_iter = InfiniteIterator(val_main_loader) - val_random_negative_loader_iter = InfiniteIterator( - val_random_negative_loader - ) + val_main_loader_iter = InfiniteIterator(val_main_loader) + val_random_negative_loader_iter = InfiniteIterator(val_random_negative_loader) - model = 
init_example_gigl_heterogeneous_model( - node_type_to_feature_dim=args.node_type_to_feature_dim, - edge_type_to_feature_dim=args.edge_type_to_feature_dim, - hid_dim=args.hid_dim, - out_dim=args.out_dim, - device=device, - wrap_with_ddp=True, - find_unused_encoder_parameters=True, - ) - optimizer = torch.optim.AdamW( - params=model.parameters(), - lr=args.learning_rate, - weight_decay=args.weight_decay, - ) - print(f"Model initialized on rank {rank} training device {device}\n{model}") - flush() + model = init_example_gigl_heterogeneous_model( + node_type_to_feature_dim=args.node_type_to_feature_dim, + edge_type_to_feature_dim=args.edge_type_to_feature_dim, + hid_dim=args.hid_dim, + out_dim=args.out_dim, + device=device, + wrap_with_ddp=True, + find_unused_encoder_parameters=True, + ) + optimizer = torch.optim.AdamW( + params=model.parameters(), + lr=args.learning_rate, + weight_decay=args.weight_decay, + ) + print(f"Model initialized on rank {rank} training device {device}\n{model}") + flush() - # We add a barrier to wait for all processes to finish preparing the dataloader and initializing the model - torch.distributed.barrier() + # We add a barrier to wait for all processes to finish preparing the dataloader and initializing the model + torch.distributed.barrier() - # Entering the training loop - training_start_time = time.time() - avg_train_loss = 0.0 - last_n_batch_avg_loss: list[float] = [] - last_n_batch_time: list[float] = [] - num_max_train_batches_per_process = args.num_max_train_batches // world_size - num_val_batches_per_process = args.num_val_batches // world_size - print( - f"num_max_train_batches_per_process is set to {num_max_train_batches_per_process}" - ) + # Entering the training loop + training_start_time = time.time() + avg_train_loss = 0.0 + last_n_batch_avg_loss: list[float] = [] + last_n_batch_time: list[float] = [] + num_max_train_batches_per_process = args.num_max_train_batches // world_size + num_val_batches_per_process = 
args.num_val_batches // world_size + print( + f"num_max_train_batches_per_process is set to {num_max_train_batches_per_process}" + ) - model.train() + model.train() + batch_start = time.time() + for main_data, random_data in zip( + train_main_loader_iter, train_random_negative_loader_iter + ): + if batch_idx >= num_max_train_batches_per_process: + print( + f"num_max_train_batches_per_process={num_max_train_batches_per_process} reached, " + f"stopping training on machine {args.cluster_info.compute_node_rank} local rank {local_rank}" + ) + break + loss = _compute_loss( + model=model, + main_data=main_data, + random_negative_data=random_data, + loss_fn=loss_fn, + supervision_edge_type=args.supervision_edge_type, + edge_dir=dataset.fetch_edge_dir(), + device=device, + ) + optimizer.zero_grad() + loss.backward() + optimizer.step() + avg_train_loss = _sync_metric_across_processes(metric=loss) + last_n_batch_avg_loss.append(avg_train_loss) + last_n_batch_time.append(time.time() - batch_start) batch_start = time.time() - for main_data, random_data in zip( - train_main_loader_iter, train_random_negative_loader_iter - ): - if batch_idx >= num_max_train_batches_per_process: - print( - f"num_max_train_batches_per_process={num_max_train_batches_per_process} reached, " - f"stopping training on machine {args.cluster_info.compute_node_rank} local rank {local_rank}" - ) - break - loss = _compute_loss( + batch_idx += 1 + if ( + batch_idx % args.log_every_n_batch == 0 or batch_idx < 10 + ): # Log the first 10 batches to ensure the model is initialized correctly + mean_batch_time = statistics.mean(last_n_batch_time) + mean_train_loss = statistics.mean(last_n_batch_avg_loss) + print( + f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" + ) + if torch.cuda.is_available(): + torch.cuda.synchronize() + print( + f"rank={rank}, batch={batch_idx}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, 
min(batch_time)={min(last_n_batch_time):.3f} sec" + ) + tensorboard_writer.log( + { + "Time/batch_mean_sec": mean_batch_time, + "Loss/train": mean_train_loss, + }, + step=batch_idx, + ) + last_n_batch_time.clear() + # log the global average training loss + print( + f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" + ) + last_n_batch_avg_loss.clear() + flush() + + if batch_idx % args.val_every_n_batch == 0: + print(f"rank={rank}, batch={batch_idx}, validating...") + model.eval() + global_avg_val_loss = _run_validation_loops( model=model, - main_data=main_data, - random_negative_data=random_data, + main_loader=val_main_loader_iter, + random_negative_loader=val_random_negative_loader_iter, loss_fn=loss_fn, supervision_edge_type=args.supervision_edge_type, edge_dir=dataset.fetch_edge_dir(), device=device, + log_every_n_batch=args.log_every_n_batch, + num_batches=num_val_batches_per_process, ) - optimizer.zero_grad() - loss.backward() - optimizer.step() - avg_train_loss = _sync_metric_across_processes(metric=loss) - last_n_batch_avg_loss.append(avg_train_loss) - last_n_batch_time.append(time.time() - batch_start) - batch_start = time.time() - batch_idx += 1 - if ( - batch_idx % args.log_every_n_batch == 0 or batch_idx < 10 - ): # Log the first 10 batches to ensure the model is initialized correctly - mean_batch_time = statistics.mean(last_n_batch_time) - mean_train_loss = statistics.mean(last_n_batch_avg_loss) - print( - f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" - ) - if torch.cuda.is_available(): - torch.cuda.synchronize() - print( - f"rank={rank}, batch={batch_idx}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" - ) - tensorboard_writer.log( - { - "Time/batch_mean_sec": mean_batch_time, - "Loss/train": mean_train_loss, - }, - step=batch_idx, - ) - 
last_n_batch_time.clear() - # log the global average training loss - print( - f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" - ) - last_n_batch_avg_loss.clear() - flush() - - if batch_idx % args.val_every_n_batch == 0: - print(f"rank={rank}, batch={batch_idx}, validating...") - model.eval() - global_avg_val_loss = _run_validation_loops( - model=model, - main_loader=val_main_loader_iter, - random_negative_loader=val_random_negative_loader_iter, - loss_fn=loss_fn, - supervision_edge_type=args.supervision_edge_type, - edge_dir=dataset.fetch_edge_dir(), - device=device, - log_every_n_batch=args.log_every_n_batch, - num_batches=num_val_batches_per_process, - ) - tensorboard_writer.log( - {"Loss/val": global_avg_val_loss}, step=batch_idx - ) - model.train() - else: - print(f"rank={rank} ended training early - no break condition was met") - print(f"---Rank {rank} finished training") - flush() - - # Memory cleanup and waiting for all processes to finish - if torch.cuda.is_available(): - torch.cuda.empty_cache() - torch.cuda.synchronize() - torch.distributed.barrier() + tensorboard_writer.log( + {"Loss/val": global_avg_val_loss}, step=batch_idx + ) + model.train() + else: + print(f"rank={rank} ended training early - no break condition was met") + print(f"---Rank {rank} finished training") + flush() - # We explicitly shutdown all the dataloaders to reduce their memory footprint. - train_main_loader.shutdown() - train_random_negative_loader.shutdown() - val_main_loader.shutdown() - val_random_negative_loader.shutdown() + # Memory cleanup and waiting for all processes to finish + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + torch.distributed.barrier() - # We save the model on the process with rank 0. 
- if torch.distributed.get_rank() == 0: - print( - f"Training loop finished, took {time.time() - training_start_time:.3f} seconds, saving model to {args.model_uri}" - ) - save_state_dict( - model=model.unwrap_from_ddp(), save_to_path_uri=args.model_uri - ) - flush() + # We explicitly shutdown all the dataloaders to reduce their memory footprint. + train_main_loader.shutdown() + train_random_negative_loader.shutdown() + val_main_loader.shutdown() + val_random_negative_loader.shutdown() - else: # should_skip_training is True, meaning we should only run testing - state_dict = load_state_dict_from_uri( - load_from_uri=args.model_uri, device=device + # We save the model on the process with rank 0. + if torch.distributed.get_rank() == 0: + print( + f"Training loop finished, took {time.time() - training_start_time:.3f} seconds, saving model to {args.model_uri}" ) - model = init_example_gigl_heterogeneous_model( - node_type_to_feature_dim=args.node_type_to_feature_dim, - edge_type_to_feature_dim=args.edge_type_to_feature_dim, - hid_dim=args.hid_dim, - out_dim=args.out_dim, - device=device, - wrap_with_ddp=True, - find_unused_encoder_parameters=True, - state_dict=state_dict, + save_state_dict( + model=model.unwrap_from_ddp(), save_to_path_uri=args.model_uri ) - print(f"Model initialized on rank {rank} training device {device}\n{model}") - - print(f"---Rank {rank} started testing") - flush() - testing_start_time = time.time() - - model.eval() + flush() - test_main_loader, test_random_negative_loader = _setup_dataloaders( - dataset=dataset, - split="test", - cluster_info=args.cluster_info, - supervision_edge_type=args.supervision_edge_type, - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, + else: # should_skip_training is True, meaning we should only run testing + state_dict = load_state_dict_from_uri( + load_from_uri=args.model_uri, 
device=device + ) + model = init_example_gigl_heterogeneous_model( + node_type_to_feature_dim=args.node_type_to_feature_dim, + edge_type_to_feature_dim=args.edge_type_to_feature_dim, + hid_dim=args.hid_dim, + out_dim=args.out_dim, device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, + wrap_with_ddp=True, + find_unused_encoder_parameters=True, + state_dict=state_dict, ) + print(f"Model initialized on rank {rank} training device {device}\n{model}") - # Since we are doing testing, we only want to go through the data once. - test_main_loader_iter = iter(test_main_loader) - test_random_negative_loader_iter = iter(test_random_negative_loader) + print(f"---Rank {rank} started testing") + flush() + testing_start_time = time.time() - global_avg_test_loss = _run_validation_loops( - model=model, - main_loader=test_main_loader_iter, - random_negative_loader=test_random_negative_loader_iter, - loss_fn=loss_fn, - supervision_edge_type=args.supervision_edge_type, - edge_dir=dataset.fetch_edge_dir(), - device=device, - log_every_n_batch=args.log_every_n_batch, - ) - tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) + model.eval() - # Memory cleanup and waiting for all processes to finish - if torch.cuda.is_available(): - torch.cuda.empty_cache() - torch.cuda.synchronize() - torch.distributed.barrier() + test_main_loader, test_random_negative_loader = _setup_dataloaders( + dataset=dataset, + split="test", + cluster_info=args.cluster_info, + supervision_edge_type=args.supervision_edge_type, + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) - 
test_main_loader.shutdown() - test_random_negative_loader.shutdown() - - # Write eval metrics on the lead process only - if torch.distributed.get_rank() == 0 and args.eval_metrics_uri is not None: - eval_metrics = EvalMetricsCollection( - metrics=[ - EvalMetric.from_eval_metric_type( - EvalMetricType.loss, global_avg_test_loss - ) - ] - ) - write_eval_metrics_to_uri( - eval_metrics=eval_metrics, eval_metrics_uri=args.eval_metrics_uri - ) + # Since we are doing testing, we only want to go through the data once. + test_main_loader_iter = iter(test_main_loader) + test_random_negative_loader_iter = iter(test_random_negative_loader) - print( - f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" + global_avg_test_loss = _run_validation_loops( + model=model, + main_loader=test_main_loader_iter, + random_negative_loader=test_random_negative_loader_iter, + loss_fn=loss_fn, + supervision_edge_type=args.supervision_edge_type, + edge_dir=dataset.fetch_edge_dir(), + device=device, + log_every_n_batch=args.log_every_n_batch, + ) + tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) + + # Memory cleanup and waiting for all processes to finish + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + torch.distributed.barrier() + + test_main_loader.shutdown() + test_random_negative_loader.shutdown() + + # Write eval metrics on the lead process only + if torch.distributed.get_rank() == 0 and args.eval_metrics_uri is not None: + eval_metrics = EvalMetricsCollection( + metrics=[ + EvalMetric.from_eval_metric_type( + EvalMetricType.loss, global_avg_test_loss + ) + ] ) - flush() - finally: - tensorboard_writer.close() + write_eval_metrics_to_uri( + eval_metrics=eval_metrics, eval_metrics_uri=args.eval_metrics_uri + ) + + print( + f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" + ) + flush() + tensorboard_writer.close() # Graph store mode cleanup: shutdown the 
compute process connection to the storage cluster. shutdown_compute_proccess() diff --git a/examples/link_prediction/graph_store/homogeneous_training.py b/examples/link_prediction/graph_store/homogeneous_training.py index e77039fc3..8bc93f535 100644 --- a/examples/link_prediction/graph_store/homogeneous_training.py +++ b/examples/link_prediction/graph_store/homogeneous_training.py @@ -454,229 +454,154 @@ def _training_process( is_chief_process = rank == 0 tensorboard_writer = TensorBoardWriter.from_env(enabled=is_chief_process) - try: - loss_fn = RetrievalLoss( - loss=torch.nn.CrossEntropyLoss(reduction="mean"), - temperature=0.07, - remove_accidental_hits=True, + loss_fn = RetrievalLoss( + loss=torch.nn.CrossEntropyLoss(reduction="mean"), + temperature=0.07, + remove_accidental_hits=True, + ) + batch_idx = 0 + + if not args.should_skip_training: + train_main_loader, train_random_negative_loader = _setup_dataloaders( + dataset=dataset, + split="train", + cluster_info=args.cluster_info, + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, ) - batch_idx = 0 - - if not args.should_skip_training: - train_main_loader, train_random_negative_loader = _setup_dataloaders( - dataset=dataset, - split="train", - cluster_info=args.cluster_info, - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, - ) - train_main_loader_iter = InfiniteIterator(train_main_loader) - train_random_negative_loader_iter = 
InfiniteIterator( - train_random_negative_loader - ) + train_main_loader_iter = InfiniteIterator(train_main_loader) + train_random_negative_loader_iter = InfiniteIterator( + train_random_negative_loader + ) - val_main_loader, val_random_negative_loader = _setup_dataloaders( - dataset=dataset, - split="val", - cluster_info=args.cluster_info, - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, - ) + val_main_loader, val_random_negative_loader = _setup_dataloaders( + dataset=dataset, + split="val", + cluster_info=args.cluster_info, + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) - val_main_loader_iter = InfiniteIterator(val_main_loader) - val_random_negative_loader_iter = InfiniteIterator( - val_random_negative_loader - ) + val_main_loader_iter = InfiniteIterator(val_main_loader) + val_random_negative_loader_iter = InfiniteIterator(val_random_negative_loader) - model = init_example_gigl_homogeneous_model( - node_feature_dim=args.node_feature_dim, - edge_feature_dim=args.edge_feature_dim, - hid_dim=args.hid_dim, - out_dim=args.out_dim, - device=device, - wrap_with_ddp=True, - find_unused_encoder_parameters=True, - ) + model = init_example_gigl_homogeneous_model( + node_feature_dim=args.node_feature_dim, + edge_feature_dim=args.edge_feature_dim, + hid_dim=args.hid_dim, + out_dim=args.out_dim, + device=device, + wrap_with_ddp=True, + find_unused_encoder_parameters=True, + ) - optimizer = 
torch.optim.AdamW( - params=model.parameters(), - lr=args.learning_rate, - weight_decay=args.weight_decay, - ) - logger.info( - f"Model initialized on rank {rank} training device {device}\n{model}" - ) - flush() + optimizer = torch.optim.AdamW( + params=model.parameters(), + lr=args.learning_rate, + weight_decay=args.weight_decay, + ) + logger.info( + f"Model initialized on rank {rank} training device {device}\n{model}" + ) + flush() - # We add a barrier to wait for all processes to finish preparing the dataloader and initializing the model - torch.distributed.barrier() + # We add a barrier to wait for all processes to finish preparing the dataloader and initializing the model + torch.distributed.barrier() - # Entering the training loop - training_start_time = time.time() - avg_train_loss = 0.0 - last_n_batch_avg_loss: list[float] = [] - last_n_batch_time: list[float] = [] - num_max_train_batches_per_process = args.num_max_train_batches // world_size - num_val_batches_per_process = args.num_val_batches // world_size - logger.info( - f"num_max_train_batches_per_process is set to {num_max_train_batches_per_process}" - ) + # Entering the training loop + training_start_time = time.time() + avg_train_loss = 0.0 + last_n_batch_avg_loss: list[float] = [] + last_n_batch_time: list[float] = [] + num_max_train_batches_per_process = args.num_max_train_batches // world_size + num_val_batches_per_process = args.num_val_batches // world_size + logger.info( + f"num_max_train_batches_per_process is set to {num_max_train_batches_per_process}" + ) - model.train() + model.train() + batch_start = time.time() + for main_data, random_data in zip( + train_main_loader_iter, train_random_negative_loader_iter + ): + if batch_idx >= num_max_train_batches_per_process: + logger.info( + f"num_max_train_batches_per_process={num_max_train_batches_per_process} reached, " + f"stopping training on machine {args.cluster_info.compute_node_rank} local rank {local_rank}" + ) + break + loss = 
_compute_loss( + model=model, + main_data=main_data, + random_negative_data=random_data, + loss_fn=loss_fn, + device=device, + ) + optimizer.zero_grad() + loss.backward() + optimizer.step() + avg_train_loss = _sync_metric_across_processes(metric=loss) + last_n_batch_avg_loss.append(avg_train_loss) + last_n_batch_time.append(time.time() - batch_start) batch_start = time.time() - for main_data, random_data in zip( - train_main_loader_iter, train_random_negative_loader_iter - ): - if batch_idx >= num_max_train_batches_per_process: - logger.info( - f"num_max_train_batches_per_process={num_max_train_batches_per_process} reached, " - f"stopping training on machine {args.cluster_info.compute_node_rank} local rank {local_rank}" - ) - break - loss = _compute_loss( - model=model, - main_data=main_data, - random_negative_data=random_data, - loss_fn=loss_fn, - device=device, + batch_idx += 1 + if batch_idx % args.log_every_n_batch == 0: + mean_batch_time = statistics.mean(last_n_batch_time) + mean_train_loss = statistics.mean(last_n_batch_avg_loss) + logger.info( + f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" ) - optimizer.zero_grad() - loss.backward() - optimizer.step() - avg_train_loss = _sync_metric_across_processes(metric=loss) - last_n_batch_avg_loss.append(avg_train_loss) - last_n_batch_time.append(time.time() - batch_start) - batch_start = time.time() - batch_idx += 1 - if batch_idx % args.log_every_n_batch == 0: - mean_batch_time = statistics.mean(last_n_batch_time) - mean_train_loss = statistics.mean(last_n_batch_avg_loss) - logger.info( - f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" - ) - if torch.cuda.is_available(): - torch.cuda.synchronize() - logger.info( - f"rank={rank}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" - ) - tensorboard_writer.log( - { - "Time/batch_mean_sec": mean_batch_time, - "Loss/train": 
mean_train_loss, - }, - step=batch_idx, - ) - last_n_batch_time.clear() - # log the global average training loss - logger.info( - f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" - ) - last_n_batch_avg_loss.clear() - flush() - - if batch_idx % args.val_every_n_batch == 0: - logger.info(f"rank={rank}, batch={batch_idx}, validating...") - model.eval() - global_avg_val_loss = _run_validation_loops( - model=model, - main_loader=val_main_loader_iter, - random_negative_loader=val_random_negative_loader_iter, - loss_fn=loss_fn, - device=device, - log_every_n_batch=args.log_every_n_batch, - num_batches=num_val_batches_per_process, - ) - tensorboard_writer.log( - {"Loss/val": global_avg_val_loss}, step=batch_idx - ) - model.train() - - logger.info(f"---Rank {rank} finished training") - flush() - - # Memory cleanup and waiting for all processes to finish - if torch.cuda.is_available(): - torch.cuda.empty_cache() - torch.cuda.synchronize() - torch.distributed.barrier() - - # We explicitly shutdown all the dataloaders to reduce their memory footprint. - train_main_loader.shutdown() - train_random_negative_loader.shutdown() - val_main_loader.shutdown() - val_random_negative_loader.shutdown() - - # We save the model on the process with rank 0. 
- if torch.distributed.get_rank() == 0: + if torch.cuda.is_available(): + torch.cuda.synchronize() logger.info( - f"Training loop finished, took {time.time() - training_start_time:.3f} seconds, saving model to {args.model_uri}" + f"rank={rank}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" ) - save_state_dict( - model=model.unwrap_from_ddp(), save_to_path_uri=args.model_uri + tensorboard_writer.log( + { + "Time/batch_mean_sec": mean_batch_time, + "Loss/train": mean_train_loss, + }, + step=batch_idx, + ) + last_n_batch_time.clear() + # log the global average training loss + logger.info( + f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" ) + last_n_batch_avg_loss.clear() flush() - else: # should_skip_training is True, meaning we should only run testing - state_dict = load_state_dict_from_uri( - load_from_uri=args.model_uri, device=device - ) - model = init_example_gigl_homogeneous_model( - node_feature_dim=args.node_feature_dim, - edge_feature_dim=args.edge_feature_dim, - hid_dim=args.hid_dim, - out_dim=args.out_dim, - device=device, - wrap_with_ddp=True, - find_unused_encoder_parameters=True, - state_dict=state_dict, - ) - logger.info( - f"Model initialized on rank {rank} training device {device}\n{model}" - ) + if batch_idx % args.val_every_n_batch == 0: + logger.info(f"rank={rank}, batch={batch_idx}, validating...") + model.eval() + global_avg_val_loss = _run_validation_loops( + model=model, + main_loader=val_main_loader_iter, + random_negative_loader=val_random_negative_loader_iter, + loss_fn=loss_fn, + device=device, + log_every_n_batch=args.log_every_n_batch, + num_batches=num_val_batches_per_process, + ) + tensorboard_writer.log( + {"Loss/val": global_avg_val_loss}, step=batch_idx + ) + model.train() - logger.info(f"---Rank {rank} started testing") + logger.info(f"---Rank {rank} 
finished training") flush() - testing_start_time = time.time() - model.eval() - - test_main_loader, test_random_negative_loader = _setup_dataloaders( - dataset=dataset, - split="test", - cluster_info=args.cluster_info, - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, - ) - - # Since we are doing testing, we only want to go through the data once. - test_main_loader_iter = iter(test_main_loader) - test_random_negative_loader_iter = iter(test_random_negative_loader) - - global_avg_test_loss = _run_validation_loops( - model=model, - main_loader=test_main_loader_iter, - random_negative_loader=test_random_negative_loader_iter, - loss_fn=loss_fn, - device=device, - log_every_n_batch=args.log_every_n_batch, - ) - tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -684,28 +609,99 @@ def _training_process( torch.cuda.synchronize() torch.distributed.barrier() - test_main_loader.shutdown() - test_random_negative_loader.shutdown() - - # Write eval metrics on the lead process only - if torch.distributed.get_rank() == 0 and args.eval_metrics_uri is not None: - eval_metrics = EvalMetricsCollection( - metrics=[ - EvalMetric.from_eval_metric_type( - EvalMetricType.loss, global_avg_test_loss - ) - ] + # We explicitly shutdown all the dataloaders to reduce their memory footprint. + train_main_loader.shutdown() + train_random_negative_loader.shutdown() + val_main_loader.shutdown() + val_random_negative_loader.shutdown() + + # We save the model on the process with rank 0. 
+ if torch.distributed.get_rank() == 0: + logger.info( + f"Training loop finished, took {time.time() - training_start_time:.3f} seconds, saving model to {args.model_uri}" ) - write_eval_metrics_to_uri( - eval_metrics=eval_metrics, eval_metrics_uri=args.eval_metrics_uri + save_state_dict( + model=model.unwrap_from_ddp(), save_to_path_uri=args.model_uri ) + flush() + else: # should_skip_training is True, meaning we should only run testing + state_dict = load_state_dict_from_uri( + load_from_uri=args.model_uri, device=device + ) + model = init_example_gigl_homogeneous_model( + node_feature_dim=args.node_feature_dim, + edge_feature_dim=args.edge_feature_dim, + hid_dim=args.hid_dim, + out_dim=args.out_dim, + device=device, + wrap_with_ddp=True, + find_unused_encoder_parameters=True, + state_dict=state_dict, + ) logger.info( - f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" + f"Model initialized on rank {rank} training device {device}\n{model}" ) - flush() - finally: - tensorboard_writer.close() + + logger.info(f"---Rank {rank} started testing") + flush() + testing_start_time = time.time() + model.eval() + + test_main_loader, test_random_negative_loader = _setup_dataloaders( + dataset=dataset, + split="test", + cluster_info=args.cluster_info, + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) + + # Since we are doing testing, we only want to go through the data once. 
+ test_main_loader_iter = iter(test_main_loader) + test_random_negative_loader_iter = iter(test_random_negative_loader) + + global_avg_test_loss = _run_validation_loops( + model=model, + main_loader=test_main_loader_iter, + random_negative_loader=test_random_negative_loader_iter, + loss_fn=loss_fn, + device=device, + log_every_n_batch=args.log_every_n_batch, + ) + tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) + + # Memory cleanup and waiting for all processes to finish + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.synchronize() + torch.distributed.barrier() + + test_main_loader.shutdown() + test_random_negative_loader.shutdown() + + # Write eval metrics on the lead process only + if torch.distributed.get_rank() == 0 and args.eval_metrics_uri is not None: + eval_metrics = EvalMetricsCollection( + metrics=[ + EvalMetric.from_eval_metric_type( + EvalMetricType.loss, global_avg_test_loss + ) + ] + ) + write_eval_metrics_to_uri( + eval_metrics=eval_metrics, eval_metrics_uri=args.eval_metrics_uri + ) + + logger.info( + f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" + ) + flush() + tensorboard_writer.close() # Graph store mode cleanup: shutdown the compute process connection to the storage cluster. 
shutdown_compute_proccess() diff --git a/examples/link_prediction/heterogeneous_training.py b/examples/link_prediction/heterogeneous_training.py index 23b7f0f17..e8cf68c1d 100644 --- a/examples/link_prediction/heterogeneous_training.py +++ b/examples/link_prediction/heterogeneous_training.py @@ -404,215 +404,17 @@ def _training_process( is_chief_process = args.machine_rank == 0 and local_rank == 0 tensorboard_writer = TensorBoardWriter.from_env(enabled=is_chief_process) - try: - loss_fn = RetrievalLoss( - loss=torch.nn.CrossEntropyLoss(reduction="mean"), - temperature=0.07, - remove_accidental_hits=True, - ) - batch_idx = 0 - - if not args.should_skip_training: - train_main_loader, train_random_negative_loader = _setup_dataloaders( - dataset=args.dataset, - split="train", - supervision_edge_type=args.supervision_edge_type, - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, - ) - - # We keep track of both the dataloader and the iterator for it - # so we can clean up resources from the dataloader later. 
- train_main_loader_iter = InfiniteIterator(train_main_loader) - train_random_negative_loader_iter = InfiniteIterator( - train_random_negative_loader - ) - - val_main_loader, val_random_negative_loader = _setup_dataloaders( - dataset=args.dataset, - split="val", - supervision_edge_type=args.supervision_edge_type, - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, - ) - - # We keep track of both the dataloader and the iterator for it - # so we can clean up resources from the dataloader later. - val_main_loader_iter = InfiniteIterator(val_main_loader) - val_random_negative_loader_iter = InfiniteIterator( - val_random_negative_loader - ) - model = init_example_gigl_heterogeneous_model( - node_type_to_feature_dim=args.node_type_to_feature_dim, - edge_type_to_feature_dim=args.edge_type_to_feature_dim, - hid_dim=args.hid_dim, - out_dim=args.out_dim, - device=device, - wrap_with_ddp=True, - # Find unused parameters in the encoder. - # We do this as the encoder model is initialized with all edge types in the graph, but the training task only uses a subset of them. 
- find_unused_encoder_parameters=True, - ) - optimizer = torch.optim.AdamW( - params=model.parameters(), - lr=args.learning_rate, - weight_decay=args.weight_decay, - ) - logger.info( - f"Model initialized on rank {rank} training device {device}\n{model}" - ) - - # We add a barrier to wait for all processes to finish preparing the dataloader and initializing the model prior to the start of training - torch.distributed.barrier() - - # Entering the training loop - training_start_time = time.time() - avg_train_loss = 0.0 - last_n_batch_avg_loss: list[float] = [] - last_n_batch_time: list[float] = [] - num_max_train_batches_per_process = args.num_max_train_batches // world_size - num_val_batches_per_process = args.num_val_batches // world_size - logger.info( - f"num_max_train_batches_per_process is set to {num_max_train_batches_per_process}" - ) - - model.train() - - # start_time gets updated every log_every_n_batch batches, batch_start gets updated every batch - batch_start = time.time() - for main_data, random_data in zip( - train_main_loader_iter, train_random_negative_loader_iter - ): - if batch_idx >= num_max_train_batches_per_process: - logger.info( - f"num_max_train_batches_per_process={num_max_train_batches_per_process} reached, " - f"stopping training on machine {args.machine_rank} local rank {local_rank}" - ) - break - loss = _compute_loss( - model=model, - main_data=main_data, - random_negative_data=random_data, - loss_fn=loss_fn, - supervision_edge_type=args.supervision_edge_type, - device=device, - ) - optimizer.zero_grad() - loss.backward() - optimizer.step() - avg_train_loss = _sync_metric_across_processes(metric=loss) - last_n_batch_avg_loss.append(avg_train_loss) - last_n_batch_time.append(time.time() - batch_start) - batch_start = time.time() - batch_idx += 1 - if batch_idx % args.log_every_n_batch == 0: - mean_batch_time = statistics.mean(last_n_batch_time) - mean_train_loss = statistics.mean(last_n_batch_avg_loss) - logger.info( - f"rank={rank}, 
batch={batch_idx}, latest local train_loss={loss:.6f}" - ) - if torch.cuda.is_available(): - # Wait for GPU operations to finish - torch.cuda.synchronize() - logger.info( - f"rank={rank}, batch={batch_idx}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" - ) - tensorboard_writer.log( - { - "Time/batch_mean_sec": mean_batch_time, - "Loss/train": mean_train_loss, - }, - step=batch_idx, - ) - last_n_batch_time.clear() - # log the global average training loss - logger.info( - f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" - ) - last_n_batch_avg_loss.clear() - - if batch_idx % args.val_every_n_batch == 0: - logger.info(f"rank={rank}, batch={batch_idx}, validating...") - model.eval() - global_avg_val_loss = _run_validation_loops( - model=model, - main_loader=val_main_loader_iter, - random_negative_loader=val_random_negative_loader_iter, - loss_fn=loss_fn, - supervision_edge_type=args.supervision_edge_type, - device=device, - log_every_n_batch=args.log_every_n_batch, - num_batches=num_val_batches_per_process, - ) - tensorboard_writer.log( - {"Loss/val": global_avg_val_loss}, step=batch_idx - ) - model.train() - - logger.info(f"---Rank {rank} finished training") - - # Memory cleanup and waiting for all processes to finish - if torch.cuda.is_available(): - torch.cuda.empty_cache() # Releases all unoccupied cached memory currently held by the caching allocator on the CUDA-enabled GPU - torch.cuda.synchronize() # Ensures all CUDA operations have finished - torch.distributed.barrier() # Waits for all processes to reach the current point - - # We explicitly shutdown all the dataloaders to reduce their memory footprint. Otherwise, experimentally we have - # observed that not all memory may be cleaned up, leading to OOM. 
- train_main_loader.shutdown() - train_random_negative_loader.shutdown() - val_main_loader.shutdown() - val_random_negative_loader.shutdown() - - # We save the model on the process with the 0th node rank and 0th local rank. - if args.machine_rank == 0 and local_rank == 0: - logger.info( - f"Training loop finished, took {time.time() - training_start_time:.3f} seconds, saving model to {args.model_uri}" - ) - # We unwrap the model from DDP to save it - # We do this so we can use the model without DDP later, e.g. for inference. - save_state_dict( - model=model.unwrap_from_ddp(), save_to_path_uri=args.model_uri - ) - - else: # should_skip_training is True, meaning we should only run testing - state_dict = load_state_dict_from_uri( - load_from_uri=args.model_uri, device=device - ) - model = init_example_gigl_heterogeneous_model( - node_type_to_feature_dim=args.node_type_to_feature_dim, - edge_type_to_feature_dim=args.edge_type_to_feature_dim, - hid_dim=args.hid_dim, - out_dim=args.out_dim, - device=device, - wrap_with_ddp=True, - # Find unused parameters in the encoder. - # We do this as the encoder model is initialized with all edge types in the graph, but the training task only uses a subset of them. 
- find_unused_encoder_parameters=True, - state_dict=state_dict, # We load the model state dict for testing - ) - logger.info( - f"Model initialized on rank {rank} training device {device}\n{model}" - ) - - logger.info(f"---Rank {rank} started testing") - testing_start_time = time.time() - - model.eval() + loss_fn = RetrievalLoss( + loss=torch.nn.CrossEntropyLoss(reduction="mean"), + temperature=0.07, + remove_accidental_hits=True, + ) + batch_idx = 0 - test_main_loader, test_random_negative_loader = _setup_dataloaders( + if not args.should_skip_training: + train_main_loader, train_random_negative_loader = _setup_dataloaders( dataset=args.dataset, - split="test", + split="train", supervision_edge_type=args.supervision_edge_type, num_neighbors=args.num_neighbors, sampling_workers_per_process=args.sampling_workers_per_process, @@ -625,20 +427,136 @@ def _training_process( # We keep track of both the dataloader and the iterator for it # so we can clean up resources from the dataloader later. - # Since we are doing testing, we only want to go through the data once, so we use iter instead of InfiniteIterator. 
- test_main_loader_iter = iter(test_main_loader) - test_random_negative_loader_iter = iter(test_random_negative_loader) + train_main_loader_iter = InfiniteIterator(train_main_loader) + train_random_negative_loader_iter = InfiniteIterator( + train_random_negative_loader + ) - global_avg_test_loss = _run_validation_loops( - model=model, - main_loader=test_main_loader_iter, - random_negative_loader=test_random_negative_loader_iter, - loss_fn=loss_fn, + val_main_loader, val_random_negative_loader = _setup_dataloaders( + dataset=args.dataset, + split="val", supervision_edge_type=args.supervision_edge_type, + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, device=device, - log_every_n_batch=args.log_every_n_batch, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, ) - tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) + + # We keep track of both the dataloader and the iterator for it + # so we can clean up resources from the dataloader later. + val_main_loader_iter = InfiniteIterator(val_main_loader) + val_random_negative_loader_iter = InfiniteIterator(val_random_negative_loader) + model = init_example_gigl_heterogeneous_model( + node_type_to_feature_dim=args.node_type_to_feature_dim, + edge_type_to_feature_dim=args.edge_type_to_feature_dim, + hid_dim=args.hid_dim, + out_dim=args.out_dim, + device=device, + wrap_with_ddp=True, + # Find unused parameters in the encoder. + # We do this as the encoder model is initialized with all edge types in the graph, but the training task only uses a subset of them. 
+ find_unused_encoder_parameters=True, + ) + optimizer = torch.optim.AdamW( + params=model.parameters(), + lr=args.learning_rate, + weight_decay=args.weight_decay, + ) + logger.info( + f"Model initialized on rank {rank} training device {device}\n{model}" + ) + + # We add a barrier to wait for all processes to finish preparing the dataloader and initializing the model prior to the start of training + torch.distributed.barrier() + + # Entering the training loop + training_start_time = time.time() + avg_train_loss = 0.0 + last_n_batch_avg_loss: list[float] = [] + last_n_batch_time: list[float] = [] + num_max_train_batches_per_process = args.num_max_train_batches // world_size + num_val_batches_per_process = args.num_val_batches // world_size + logger.info( + f"num_max_train_batches_per_process is set to {num_max_train_batches_per_process}" + ) + + model.train() + + # start_time gets updated every log_every_n_batch batches, batch_start gets updated every batch + batch_start = time.time() + for main_data, random_data in zip( + train_main_loader_iter, train_random_negative_loader_iter + ): + if batch_idx >= num_max_train_batches_per_process: + logger.info( + f"num_max_train_batches_per_process={num_max_train_batches_per_process} reached, " + f"stopping training on machine {args.machine_rank} local rank {local_rank}" + ) + break + loss = _compute_loss( + model=model, + main_data=main_data, + random_negative_data=random_data, + loss_fn=loss_fn, + supervision_edge_type=args.supervision_edge_type, + device=device, + ) + optimizer.zero_grad() + loss.backward() + optimizer.step() + avg_train_loss = _sync_metric_across_processes(metric=loss) + last_n_batch_avg_loss.append(avg_train_loss) + last_n_batch_time.append(time.time() - batch_start) + batch_start = time.time() + batch_idx += 1 + if batch_idx % args.log_every_n_batch == 0: + mean_batch_time = statistics.mean(last_n_batch_time) + mean_train_loss = statistics.mean(last_n_batch_avg_loss) + logger.info( + f"rank={rank}, 
batch={batch_idx}, latest local train_loss={loss:.6f}" + ) + if torch.cuda.is_available(): + # Wait for GPU operations to finish + torch.cuda.synchronize() + logger.info( + f"rank={rank}, batch={batch_idx}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + ) + tensorboard_writer.log( + { + "Time/batch_mean_sec": mean_batch_time, + "Loss/train": mean_train_loss, + }, + step=batch_idx, + ) + last_n_batch_time.clear() + # log the global average training loss + logger.info( + f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" + ) + last_n_batch_avg_loss.clear() + + if batch_idx % args.val_every_n_batch == 0: + logger.info(f"rank={rank}, batch={batch_idx}, validating...") + model.eval() + global_avg_val_loss = _run_validation_loops( + model=model, + main_loader=val_main_loader_iter, + random_negative_loader=val_random_negative_loader_iter, + loss_fn=loss_fn, + supervision_edge_type=args.supervision_edge_type, + device=device, + log_every_n_batch=args.log_every_n_batch, + num_batches=num_val_batches_per_process, + ) + tensorboard_writer.log( + {"Loss/val": global_avg_val_loss}, step=batch_idx + ) + model.train() + + logger.info(f"---Rank {rank} finished training") # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -646,34 +564,108 @@ def _training_process( torch.cuda.synchronize() # Ensures all CUDA operations have finished torch.distributed.barrier() # Waits for all processes to reach the current point - test_main_loader.shutdown() - test_random_negative_loader.shutdown() - - # Write eval metrics on the lead process only - # These get written to some JSON uder the gcs:////trainer/trainer_eval_metrics.json - # And then the "Log Trainer Eval Metrics" component in the KFP pipeline UI will log them to the UI, - # as a metrics artifact. 
- if ( - args.machine_rank == 0 - and local_rank == 0 - and args.eval_metrics_uri is not None - ): - eval_metrics = EvalMetricsCollection( - metrics=[ - EvalMetric.from_eval_metric_type( - EvalMetricType.loss, global_avg_test_loss - ) - ] + # We explicitly shutdown all the dataloaders to reduce their memory footprint. Otherwise, experimentally we have + # observed that not all memory may be cleaned up, leading to OOM. + train_main_loader.shutdown() + train_random_negative_loader.shutdown() + val_main_loader.shutdown() + val_random_negative_loader.shutdown() + + # We save the model on the process with the 0th node rank and 0th local rank. + if args.machine_rank == 0 and local_rank == 0: + logger.info( + f"Training loop finished, took {time.time() - training_start_time:.3f} seconds, saving model to {args.model_uri}" ) - write_eval_metrics_to_uri( - eval_metrics=eval_metrics, eval_metrics_uri=args.eval_metrics_uri + # We unwrap the model from DDP to save it + # We do this so we can use the model without DDP later, e.g. for inference. + save_state_dict( + model=model.unwrap_from_ddp(), save_to_path_uri=args.model_uri ) + else: # should_skip_training is True, meaning we should only run testing + state_dict = load_state_dict_from_uri( + load_from_uri=args.model_uri, device=device + ) + model = init_example_gigl_heterogeneous_model( + node_type_to_feature_dim=args.node_type_to_feature_dim, + edge_type_to_feature_dim=args.edge_type_to_feature_dim, + hid_dim=args.hid_dim, + out_dim=args.out_dim, + device=device, + wrap_with_ddp=True, + # Find unused parameters in the encoder. + # We do this as the encoder model is initialized with all edge types in the graph, but the training task only uses a subset of them. 
+ find_unused_encoder_parameters=True, + state_dict=state_dict, # We load the model state dict for testing + ) logger.info( - f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" + f"Model initialized on rank {rank} training device {device}\n{model}" + ) + + logger.info(f"---Rank {rank} started testing") + testing_start_time = time.time() + + model.eval() + + test_main_loader, test_random_negative_loader = _setup_dataloaders( + dataset=args.dataset, + split="test", + supervision_edge_type=args.supervision_edge_type, + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) + + # We keep track of both the dataloader and the iterator for it + # so we can clean up resources from the dataloader later. + # Since we are doing testing, we only want to go through the data once, so we use iter instead of InfiniteIterator. 
+ test_main_loader_iter = iter(test_main_loader) + test_random_negative_loader_iter = iter(test_random_negative_loader) + + global_avg_test_loss = _run_validation_loops( + model=model, + main_loader=test_main_loader_iter, + random_negative_loader=test_random_negative_loader_iter, + loss_fn=loss_fn, + supervision_edge_type=args.supervision_edge_type, + device=device, + log_every_n_batch=args.log_every_n_batch, + ) + tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) + + # Memory cleanup and waiting for all processes to finish + if torch.cuda.is_available(): + torch.cuda.empty_cache() # Releases all unoccupied cached memory currently held by the caching allocator on the CUDA-enabled GPU + torch.cuda.synchronize() # Ensures all CUDA operations have finished + torch.distributed.barrier() # Waits for all processes to reach the current point + + test_main_loader.shutdown() + test_random_negative_loader.shutdown() + + # Write eval metrics on the lead process only + # These get written to some JSON uder the gcs:////trainer/trainer_eval_metrics.json + # And then the "Log Trainer Eval Metrics" component in the KFP pipeline UI will log them to the UI, + # as a metrics artifact. 
+ if args.machine_rank == 0 and local_rank == 0 and args.eval_metrics_uri is not None: + eval_metrics = EvalMetricsCollection( + metrics=[ + EvalMetric.from_eval_metric_type( + EvalMetricType.loss, global_avg_test_loss + ) + ] + ) + write_eval_metrics_to_uri( + eval_metrics=eval_metrics, eval_metrics_uri=args.eval_metrics_uri ) - finally: - tensorboard_writer.close() + + logger.info( + f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" + ) + tensorboard_writer.close() torch.distributed.destroy_process_group() diff --git a/examples/link_prediction/homogeneous_training.py b/examples/link_prediction/homogeneous_training.py index cf0cee582..6470ab1ef 100644 --- a/examples/link_prediction/homogeneous_training.py +++ b/examples/link_prediction/homogeneous_training.py @@ -363,211 +363,36 @@ def _training_process( is_chief_process = args.machine_rank == 0 and local_rank == 0 tensorboard_writer = TensorBoardWriter.from_env(enabled=is_chief_process) - try: - loss_fn = RetrievalLoss( - loss=torch.nn.CrossEntropyLoss(reduction="mean"), - temperature=0.07, - remove_accidental_hits=True, - ) - batch_idx = 0 - - if not args.should_skip_training: - train_main_loader, train_random_negative_loader = _setup_dataloaders( - dataset=args.dataset, - split="train", - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, - ) - - # We keep track of both the dataloader and the iterator for it - # so we can clean up resources from the dataloader later. 
- train_main_loader_iter = InfiniteIterator(train_main_loader) - train_random_negative_loader_iter = InfiniteIterator( - train_random_negative_loader - ) - - val_main_loader, val_random_negative_loader = _setup_dataloaders( - dataset=args.dataset, - split="val", - num_neighbors=args.num_neighbors, - sampling_workers_per_process=args.sampling_workers_per_process, - main_batch_size=args.main_batch_size, - random_batch_size=args.random_batch_size, - device=device, - sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, - process_start_gap_seconds=args.process_start_gap_seconds, - ) - - # We keep track of both the dataloader and the iterator for it - # so we can clean up resources from the dataloader later. - val_main_loader_iter = InfiniteIterator(val_main_loader) - val_random_negative_loader_iter = InfiniteIterator( - val_random_negative_loader - ) - - model = init_example_gigl_homogeneous_model( - node_feature_dim=args.node_feature_dim, - edge_feature_dim=args.edge_feature_dim, - hid_dim=args.hid_dim, - out_dim=args.out_dim, - device=device, - wrap_with_ddp=True, # We initialize the model for DDP - # Find unused parameters in the encoder. - # We do this as the encoder model is initialized with all edge types in the graph, but the training task only uses a subset of them. 
- find_unused_encoder_parameters=True, - ) - - optimizer = torch.optim.AdamW( - params=model.parameters(), - lr=args.learning_rate, - weight_decay=args.weight_decay, - ) - logger.info( - f"Model initialized on rank {rank} training device {device}\n{model}" - ) - - # We add a barrier to wait for all processes to finish preparing the dataloader and initializing the model prior to the start of training - torch.distributed.barrier() - - # Entering the training loop - training_start_time = time.time() - avg_train_loss = 0.0 - last_n_batch_avg_loss: list[float] = [] - last_n_batch_time: list[float] = [] - num_max_train_batches_per_process = args.num_max_train_batches // world_size - num_val_batches_per_process = args.num_val_batches // world_size - logger.info( - f"num_max_train_batches_per_process is set to {num_max_train_batches_per_process}" - ) - - model.train() + loss_fn = RetrievalLoss( + loss=torch.nn.CrossEntropyLoss(reduction="mean"), + temperature=0.07, + remove_accidental_hits=True, + ) + batch_idx = 0 - # start_time gets updated every log_every_n_batch batches, batch_start gets updated every batch - batch_start = time.time() - for main_data, random_data in zip( - train_main_loader_iter, train_random_negative_loader_iter - ): - if batch_idx >= num_max_train_batches_per_process: - logger.info( - f"num_max_train_batches_per_process={num_max_train_batches_per_process} reached, " - f"stopping training on machine {args.machine_rank} local rank {local_rank}" - ) - break - loss = _compute_loss( - model=model, - main_data=main_data, - random_negative_data=random_data, - loss_fn=loss_fn, - device=device, - ) - optimizer.zero_grad() - loss.backward() - optimizer.step() - avg_train_loss = _sync_metric_across_processes(metric=loss) - last_n_batch_avg_loss.append(avg_train_loss) - last_n_batch_time.append(time.time() - batch_start) - batch_start = time.time() - batch_idx += 1 - if batch_idx % args.log_every_n_batch == 0: - mean_batch_time = 
statistics.mean(last_n_batch_time) - mean_train_loss = statistics.mean(last_n_batch_avg_loss) - logger.info( - f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" - ) - if torch.cuda.is_available(): - # Wait for GPU operations to finish - torch.cuda.synchronize() - logger.info( - f"rank={rank}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" - ) - tensorboard_writer.log( - { - "Time/batch_mean_sec": mean_batch_time, - "Loss/train": mean_train_loss, - }, - step=batch_idx, - ) - last_n_batch_time.clear() - # log the global average training loss - logger.info( - f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" - ) - last_n_batch_avg_loss.clear() - - if batch_idx % args.val_every_n_batch == 0: - logger.info(f"rank={rank}, batch={batch_idx}, validating...") - model.eval() - global_avg_val_loss = _run_validation_loops( - model=model, - main_loader=val_main_loader_iter, - random_negative_loader=val_random_negative_loader_iter, - loss_fn=loss_fn, - device=device, - log_every_n_batch=args.log_every_n_batch, - num_batches=num_val_batches_per_process, - ) - tensorboard_writer.log( - {"Loss/val": global_avg_val_loss}, step=batch_idx - ) - model.train() - - logger.info(f"---Rank {rank} finished training") - - # Memory cleanup and waiting for all processes to finish - if torch.cuda.is_available(): - torch.cuda.empty_cache() # Releases all unoccupied cached memory currently held by the caching allocator on the CUDA-enabled GPU - torch.cuda.synchronize() # Ensures all CUDA operations have finished - torch.distributed.barrier() # Waits for all processes to reach the current point - - # We explicitly shutdown all the dataloaders to reduce their memory footprint. Otherwise, experimentally we have - # observed that not all memory may be cleaned up, leading to OOM. 
- train_main_loader.shutdown() - train_random_negative_loader.shutdown() - val_main_loader.shutdown() - val_random_negative_loader.shutdown() - - # We save the model on the process with the 0th node rank and 0th local rank. - if args.machine_rank == 0 and local_rank == 0: - logger.info( - f"Training loop finished, took {time.time() - training_start_time:.3f} seconds, saving model to {args.model_uri}" - ) - # We unwrap the model from DDP to save it - # We do this so we can use the model without DDP later, e.g. for inference. - save_state_dict( - model=model.unwrap_from_ddp(), save_to_path_uri=args.model_uri - ) - else: # should_skip_training is True, meaning we should only run testing - state_dict = load_state_dict_from_uri( - load_from_uri=args.model_uri, device=device - ) - model = init_example_gigl_homogeneous_model( - node_feature_dim=args.node_feature_dim, - edge_feature_dim=args.edge_feature_dim, - hid_dim=args.hid_dim, - out_dim=args.out_dim, - device=device, - wrap_with_ddp=True, # We initialize the model for DDP - # Find unused parameters in the encoder. - # We do this as the encoder model is initialized with all edge types in the graph, but the training task only uses a subset of them. 
- find_unused_encoder_parameters=True, - state_dict=state_dict, # We load the model state dict for testing - ) - logger.info( - f"Model initialized on rank {rank} training device {device}\n{model}" - ) + if not args.should_skip_training: + train_main_loader, train_random_negative_loader = _setup_dataloaders( + dataset=args.dataset, + split="train", + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) - logger.info(f"---Rank {rank} started testing") - testing_start_time = time.time() - model.eval() + # We keep track of both the dataloader and the iterator for it + # so we can clean up resources from the dataloader later. + train_main_loader_iter = InfiniteIterator(train_main_loader) + train_random_negative_loader_iter = InfiniteIterator( + train_random_negative_loader + ) - test_main_loader, test_random_negative_loader = _setup_dataloaders( + val_main_loader, val_random_negative_loader = _setup_dataloaders( dataset=args.dataset, - split="test", + split="val", num_neighbors=args.num_neighbors, sampling_workers_per_process=args.sampling_workers_per_process, main_batch_size=args.main_batch_size, @@ -579,19 +404,116 @@ def _training_process( # We keep track of both the dataloader and the iterator for it # so we can clean up resources from the dataloader later. - # Since we are doing testing, we only want to go through the data once, so we use iter instead of InfiniteIterator. 
- test_main_loader_iter = iter(test_main_loader) - test_random_negative_loader_iter = iter(test_random_negative_loader) - - global_avg_test_loss = _run_validation_loops( - model=model, - main_loader=test_main_loader_iter, - random_negative_loader=test_random_negative_loader_iter, - loss_fn=loss_fn, + val_main_loader_iter = InfiniteIterator(val_main_loader) + val_random_negative_loader_iter = InfiniteIterator(val_random_negative_loader) + + model = init_example_gigl_homogeneous_model( + node_feature_dim=args.node_feature_dim, + edge_feature_dim=args.edge_feature_dim, + hid_dim=args.hid_dim, + out_dim=args.out_dim, device=device, - log_every_n_batch=args.log_every_n_batch, + wrap_with_ddp=True, # We initialize the model for DDP + # Find unused parameters in the encoder. + # We do this as the encoder model is initialized with all edge types in the graph, but the training task only uses a subset of them. + find_unused_encoder_parameters=True, ) - tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) + + optimizer = torch.optim.AdamW( + params=model.parameters(), + lr=args.learning_rate, + weight_decay=args.weight_decay, + ) + logger.info( + f"Model initialized on rank {rank} training device {device}\n{model}" + ) + + # We add a barrier to wait for all processes to finish preparing the dataloader and initializing the model prior to the start of training + torch.distributed.barrier() + + # Entering the training loop + training_start_time = time.time() + avg_train_loss = 0.0 + last_n_batch_avg_loss: list[float] = [] + last_n_batch_time: list[float] = [] + num_max_train_batches_per_process = args.num_max_train_batches // world_size + num_val_batches_per_process = args.num_val_batches // world_size + logger.info( + f"num_max_train_batches_per_process is set to {num_max_train_batches_per_process}" + ) + + model.train() + + # start_time gets updated every log_every_n_batch batches, batch_start gets updated every batch + batch_start = time.time() + for 
main_data, random_data in zip( + train_main_loader_iter, train_random_negative_loader_iter + ): + if batch_idx >= num_max_train_batches_per_process: + logger.info( + f"num_max_train_batches_per_process={num_max_train_batches_per_process} reached, " + f"stopping training on machine {args.machine_rank} local rank {local_rank}" + ) + break + loss = _compute_loss( + model=model, + main_data=main_data, + random_negative_data=random_data, + loss_fn=loss_fn, + device=device, + ) + optimizer.zero_grad() + loss.backward() + optimizer.step() + avg_train_loss = _sync_metric_across_processes(metric=loss) + last_n_batch_avg_loss.append(avg_train_loss) + last_n_batch_time.append(time.time() - batch_start) + batch_start = time.time() + batch_idx += 1 + if batch_idx % args.log_every_n_batch == 0: + mean_batch_time = statistics.mean(last_n_batch_time) + mean_train_loss = statistics.mean(last_n_batch_avg_loss) + logger.info( + f"rank={rank}, batch={batch_idx}, latest local train_loss={loss:.6f}" + ) + if torch.cuda.is_available(): + # Wait for GPU operations to finish + torch.cuda.synchronize() + logger.info( + f"rank={rank}, mean(batch_time)={mean_batch_time:.3f} sec, max(batch_time)={max(last_n_batch_time):.3f} sec, min(batch_time)={min(last_n_batch_time):.3f} sec" + ) + tensorboard_writer.log( + { + "Time/batch_mean_sec": mean_batch_time, + "Loss/train": mean_train_loss, + }, + step=batch_idx, + ) + last_n_batch_time.clear() + # log the global average training loss + logger.info( + f"rank={rank}, latest avg_train_loss={avg_train_loss:.6f}, last {args.log_every_n_batch} mean(avg_train_loss)={mean_train_loss:.6f}" + ) + last_n_batch_avg_loss.clear() + + if batch_idx % args.val_every_n_batch == 0: + logger.info(f"rank={rank}, batch={batch_idx}, validating...") + model.eval() + global_avg_val_loss = _run_validation_loops( + model=model, + main_loader=val_main_loader_iter, + random_negative_loader=val_random_negative_loader_iter, + loss_fn=loss_fn, + device=device, + 
log_every_n_batch=args.log_every_n_batch, + num_batches=num_val_batches_per_process, + ) + tensorboard_writer.log( + {"Loss/val": global_avg_val_loss}, step=batch_idx + ) + model.train() + + logger.info(f"---Rank {rank} finished training") # Memory cleanup and waiting for all processes to finish if torch.cuda.is_available(): @@ -599,34 +521,104 @@ def _training_process( torch.cuda.synchronize() # Ensures all CUDA operations have finished torch.distributed.barrier() # Waits for all processes to reach the current point - test_main_loader.shutdown() - test_random_negative_loader.shutdown() - - # Write eval metrics on the lead process only - # These get written to some JSON under the gcs:////trainer/trainer_eval_metrics.json - # And then the "Log Trainer Eval Metrics" component in the KFP pipeline UI will log them to the UI, - # as a metrics artifact. - if ( - args.machine_rank == 0 - and local_rank == 0 - and args.eval_metrics_uri is not None - ): - eval_metrics = EvalMetricsCollection( - metrics=[ - EvalMetric.from_eval_metric_type( - EvalMetricType.loss, global_avg_test_loss - ) - ] + # We explicitly shutdown all the dataloaders to reduce their memory footprint. Otherwise, experimentally we have + # observed that not all memory may be cleaned up, leading to OOM. + train_main_loader.shutdown() + train_random_negative_loader.shutdown() + val_main_loader.shutdown() + val_random_negative_loader.shutdown() + + # We save the model on the process with the 0th node rank and 0th local rank. + if args.machine_rank == 0 and local_rank == 0: + logger.info( + f"Training loop finished, took {time.time() - training_start_time:.3f} seconds, saving model to {args.model_uri}" ) - write_eval_metrics_to_uri( - eval_metrics=eval_metrics, eval_metrics_uri=args.eval_metrics_uri + # We unwrap the model from DDP to save it + # We do this so we can use the model without DDP later, e.g. for inference. 
+ save_state_dict( + model=model.unwrap_from_ddp(), save_to_path_uri=args.model_uri ) - + else: # should_skip_training is True, meaning we should only run testing + state_dict = load_state_dict_from_uri( + load_from_uri=args.model_uri, device=device + ) + model = init_example_gigl_homogeneous_model( + node_feature_dim=args.node_feature_dim, + edge_feature_dim=args.edge_feature_dim, + hid_dim=args.hid_dim, + out_dim=args.out_dim, + device=device, + wrap_with_ddp=True, # We initialize the model for DDP + # Find unused parameters in the encoder. + # We do this as the encoder model is initialized with all edge types in the graph, but the training task only uses a subset of them. + find_unused_encoder_parameters=True, + state_dict=state_dict, # We load the model state dict for testing + ) logger.info( - f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" + f"Model initialized on rank {rank} training device {device}\n{model}" + ) + + logger.info(f"---Rank {rank} started testing") + testing_start_time = time.time() + model.eval() + + test_main_loader, test_random_negative_loader = _setup_dataloaders( + dataset=args.dataset, + split="test", + num_neighbors=args.num_neighbors, + sampling_workers_per_process=args.sampling_workers_per_process, + main_batch_size=args.main_batch_size, + random_batch_size=args.random_batch_size, + device=device, + sampling_worker_shared_channel_size=args.sampling_worker_shared_channel_size, + process_start_gap_seconds=args.process_start_gap_seconds, + ) + + # We keep track of both the dataloader and the iterator for it + # so we can clean up resources from the dataloader later. + # Since we are doing testing, we only want to go through the data once, so we use iter instead of InfiniteIterator. 
+ test_main_loader_iter = iter(test_main_loader) + test_random_negative_loader_iter = iter(test_random_negative_loader) + + global_avg_test_loss = _run_validation_loops( + model=model, + main_loader=test_main_loader_iter, + random_negative_loader=test_random_negative_loader_iter, + loss_fn=loss_fn, + device=device, + log_every_n_batch=args.log_every_n_batch, + ) + tensorboard_writer.log({"Loss/test": global_avg_test_loss}, step=batch_idx) + + # Memory cleanup and waiting for all processes to finish + if torch.cuda.is_available(): + torch.cuda.empty_cache() # Releases all unoccupied cached memory currently held by the caching allocator on the CUDA-enabled GPU + torch.cuda.synchronize() # Ensures all CUDA operations have finished + torch.distributed.barrier() # Waits for all processes to reach the current point + + test_main_loader.shutdown() + test_random_negative_loader.shutdown() + + # Write eval metrics on the lead process only + # These get written to some JSON under the gcs:////trainer/trainer_eval_metrics.json + # And then the "Log Trainer Eval Metrics" component in the KFP pipeline UI will log them to the UI, + # as a metrics artifact. 
+ if args.machine_rank == 0 and local_rank == 0 and args.eval_metrics_uri is not None: + eval_metrics = EvalMetricsCollection( + metrics=[ + EvalMetric.from_eval_metric_type( + EvalMetricType.loss, global_avg_test_loss + ) + ] + ) + write_eval_metrics_to_uri( + eval_metrics=eval_metrics, eval_metrics_uri=args.eval_metrics_uri ) - finally: - tensorboard_writer.close() + + logger.info( + f"---Rank {rank} finished testing in {time.time() - testing_start_time:.3f} seconds" + ) + tensorboard_writer.close() torch.distributed.destroy_process_group() From dfbcc04f1edc20485eec5a1d6d08110704a83669 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 23:55:20 +0000 Subject: [PATCH 52/59] tensorboard_writer: hoist aiplatform import to module top The local ``from google.cloud import aiplatform`` inside ``close()`` and ``_maybe_start_uploader`` was a defensive measure for environments without ``google-cloud-aiplatform``. It's a hard dependency here (declared in pyproject.toml and used by other modules at import time), so the local-import wrapping just adds noise. Co-Authored-By: Claude Opus 4.7 (1M context) --- gigl/utils/tensorboard_writer.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/gigl/utils/tensorboard_writer.py b/gigl/utils/tensorboard_writer.py index 6b39d76c9..5cc9350ad 100644 --- a/gigl/utils/tensorboard_writer.py +++ b/gigl/utils/tensorboard_writer.py @@ -5,6 +5,7 @@ from typing import Any, Final, Optional import tensorflow as tf +from google.cloud import aiplatform # Vertex AI sets this env var to ``/logs/`` (or # ``//logs/`` for HyperparameterTuningJob trials) @@ -170,10 +171,6 @@ def close(self) -> None: if self._writer is not None: self._writer.close() if self._upload_started: - # Local import keeps the optional aiplatform dependency out of - # the no-op path. 
- from google.cloud import aiplatform - aiplatform.end_upload_tb_log() self._closed = True @@ -219,9 +216,6 @@ def _maybe_start_uploader(*, parent_log_dir: str) -> bool: "configured on GiglResourceConfig." ) - # Local import: aiplatform is only needed when the user opts in. - from google.cloud import aiplatform - aiplatform.init( project=match["project"], location=match["location"], From 5d820605f4091381aa3d267dae37cececbb1b3a2 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Tue, 5 May 2026 23:55:26 +0000 Subject: [PATCH 53/59] tests: drop pure-mock test files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vertex_ai_test.py and glt_trainer_test.py were primarily exercising the test mocks rather than real behavior. The real behavior is covered by tensorboard_writer_test.py (writer + uploader lifecycle), vertex_ai_launcher_test.py (launcher → VertexAiJobConfig wiring), and gbml_and_resource_config_compatibility_checks_test.py (validation). Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/unit/src/common/vertex_ai_test.py | 297 -------------------- tests/unit/src/training/glt_trainer_test.py | 111 -------- 2 files changed, 408 deletions(-) delete mode 100644 tests/unit/src/common/vertex_ai_test.py delete mode 100644 tests/unit/src/training/glt_trainer_test.py diff --git a/tests/unit/src/common/vertex_ai_test.py b/tests/unit/src/common/vertex_ai_test.py deleted file mode 100644 index 470005dac..000000000 --- a/tests/unit/src/common/vertex_ai_test.py +++ /dev/null @@ -1,297 +0,0 @@ -"""Unit tests for gigl.common.services.vertex_ai.""" - -from unittest.mock import Mock, patch - -from absl.testing import absltest - -from gigl.common.services.vertex_ai import ( - VertexAiJobConfig, - VertexAIService, - _build_tensorboard_experiment_url, -) -from tests.test_assets.test_case import TestCase - - -class TestVertexAIService(TestCase): - """Tests for Vertex AI CustomJob submission plumbing.""" - - 
@patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") - @patch("gigl.common.services.vertex_ai.aiplatform.init") - def test_submit_job_passes_tensorboard_and_base_output_dir( - self, - mock_aiplatform_init, - mock_custom_job_class, - ) -> None: - mock_job = Mock() - mock_job.resource_name = "projects/test/locations/us-central1/customJobs/123" - mock_job.name = "123" - mock_custom_job_class.return_value = mock_job - - service = VertexAIService( - project="test-project", - location="us-central1", - service_account="svc@test-project.iam.gserviceaccount.com", - staging_bucket="gs://test-staging-bucket", - ) - - job_config = VertexAiJobConfig( - job_name="test-job", - container_uri="gcr.io/test/image:latest", - command=["python", "-m", "trainer"], - base_output_dir="gs://test-perm-bucket/test-job/trainer", - tensorboard_resource_name=( - "projects/test-project/locations/us-central1/tensorboards/123" - ), - ) - - service.launch_job(job_config=job_config) - - mock_aiplatform_init.assert_called_once_with( - project="test-project", - location="us-central1", - staging_bucket="gs://test-staging-bucket", - ) - mock_custom_job_class.assert_called_once() - _, custom_job_kwargs = mock_custom_job_class.call_args - self.assertEqual( - custom_job_kwargs["base_output_dir"], - job_config.base_output_dir, - ) - mock_job.submit.assert_called_once() - _, submit_kwargs = mock_job.submit.call_args - self.assertEqual( - submit_kwargs["tensorboard"], - job_config.tensorboard_resource_name, - ) - self.assertNotIn("experiment", submit_kwargs) - - def test_vertex_ai_job_config_carries_experiment_name(self) -> None: - cfg = VertexAiJobConfig( - job_name="job", - container_uri="gcr.io/p/img:tag", - command=["python", "-m", "x"], - tensorboard_resource_name="projects/p/locations/us/tensorboards/1", - tensorboard_experiment_name="my-comparison", - ) - self.assertEqual(cfg.tensorboard_experiment_name, "my-comparison") - - @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") - 
@patch("gigl.common.services.vertex_ai.aiplatform.init") - def test_submit_job_passes_tensorboard_with_or_without_experiment_name( - self, - mock_aiplatform_init: Mock, - mock_custom_job_class: Mock, - ) -> None: - """``tensorboard=`` is always passed when a TB resource is set, so the - VAI job page's "Open TensorBoard" link works. The chief-rank uploader - (driven by injected env vars) handles cross-job comparison separately. - """ - mock_job = Mock() - mock_job.resource_name = "projects/test/locations/us-central1/customJobs/456" - mock_job.name = "456" - mock_custom_job_class.return_value = mock_job - - service = VertexAIService( - project="test-project", - location="us-central1", - service_account="svc@test-project.iam.gserviceaccount.com", - staging_bucket="gs://test-staging-bucket", - ) - - job_config = VertexAiJobConfig( - job_name="test-job-exp", - container_uri="gcr.io/test/image:latest", - command=["python", "-m", "trainer"], - base_output_dir="gs://test-perm-bucket/test-job/trainer", - tensorboard_resource_name="projects/test/locations/us-central1/tensorboards/123", - tensorboard_experiment_name="my-comparison", - ) - - service.launch_job(job_config=job_config) - - mock_job.submit.assert_called_once() - submit_kwargs = mock_job.submit.call_args.kwargs - self.assertEqual( - submit_kwargs["tensorboard"], - "projects/test/locations/us-central1/tensorboards/123", - ) - self.assertNotIn("experiment", submit_kwargs) - self.assertNotIn("experiment_run", submit_kwargs) - - @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") - @patch("gigl.common.services.vertex_ai.aiplatform.init") - def test_submit_job_raises_when_experiment_name_set_but_no_tb_resource( - self, - mock_aiplatform_init: Mock, - mock_custom_job_class: Mock, - ) -> None: - """When tensorboard_experiment_name is set but tensorboard_resource_name is empty, raises ValueError.""" - mock_job = Mock() - mock_custom_job_class.return_value = mock_job - - service = VertexAIService( - 
project="test-project", - location="us-central1", - service_account="svc@test-project.iam.gserviceaccount.com", - staging_bucket="gs://test-staging-bucket", - ) - - job_config = VertexAiJobConfig( - job_name="test-job-no-tb", - container_uri="gcr.io/test/image:latest", - command=["python", "-m", "trainer"], - base_output_dir="gs://test-perm-bucket/test-job/trainer", - tensorboard_resource_name="", - tensorboard_experiment_name="my-comparison", - ) - - with self.assertRaises(ValueError) as ctx: - service.launch_job(job_config=job_config) - - self.assertIn("tensorboard_resource_name", str(ctx.exception)) - - -class TestSubmitJobValidatesExperimentName(TestCase): - """Tests that _submit_job validates the user-supplied experiment name.""" - - @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") - @patch("gigl.common.services.vertex_ai.aiplatform.init") - def test_invalid_experiment_name_raises( - self, - mock_aiplatform_init: Mock, - mock_custom_job_class: Mock, - ) -> None: - """User-supplied tensorboard_experiment_name must match Vertex's regex.""" - mock_job = Mock() - mock_custom_job_class.return_value = mock_job - - service = VertexAIService( - project="test-project", - location="us-central1", - service_account="svc@test-project.iam.gserviceaccount.com", - staging_bucket="gs://test-staging-bucket", - ) - - job_config = VertexAiJobConfig( - job_name="any-job", - container_uri="gcr.io/test/image:latest", - command=["python", "-m", "trainer"], - base_output_dir="gs://test-perm-bucket/run/trainer", - tensorboard_resource_name="projects/test/locations/us-central1/tensorboards/123", - tensorboard_experiment_name="Invalid_Experiment_Name", - ) - - with self.assertRaises(ValueError) as ctx: - service.launch_job(job_config=job_config) - - self.assertIn("tensorboard_experiment_name", str(ctx.exception)) - - -class TestBuildTensorboardExperimentUrl(TestCase): - """Tests for the small URL-builder helper used in submit-time logging.""" - - def 
test_builds_url_for_well_formed_resource_name(self) -> None: - url = _build_tensorboard_experiment_url( - tensorboard_resource_name="projects/p/locations/us-central1/tensorboards/42", - experiment_id="my-exp", - ) - self.assertEqual( - url, - "https://us-central1.tensorboard.googleusercontent.com/experiment/" - "projects+p+locations+us-central1+tensorboards+42+experiments+my-exp", - ) - - def test_returns_none_for_malformed_resource_name(self) -> None: - # A stray bad TB resource name should never break submission — the - # caller falls back to no URL log. - self.assertIsNone( - _build_tensorboard_experiment_url( - tensorboard_resource_name="not-a-resource-name", - experiment_id="my-exp", - ) - ) - - -class TestSubmitJobLogsTensorboardUrls(TestCase): - """Tests that _submit_job logs both per-job and cross-job TB URLs.""" - - @patch("gigl.common.services.vertex_ai.logger.info") - @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") - @patch("gigl.common.services.vertex_ai.aiplatform.init") - def test_logs_both_urls_when_experiment_name_set( - self, - mock_aiplatform_init: Mock, - mock_custom_job_class: Mock, - mock_logger_info: Mock, - ) -> None: - mock_job = Mock() - mock_job.resource_name = "projects/p/locations/us-central1/customJobs/9876" - mock_job.name = "9876" # numeric job ID - mock_custom_job_class.return_value = mock_job - - service = VertexAIService( - project="p", - location="us-central1", - service_account="svc@p.iam.gserviceaccount.com", - staging_bucket="gs://staging", - ) - job_config = VertexAiJobConfig( - job_name="my-job", - container_uri="gcr.io/p/img", - command=["python", "-m", "trainer"], - base_output_dir="gs://staging/my-job/trainer", - tensorboard_resource_name="projects/p/locations/us-central1/tensorboards/42", - tensorboard_experiment_name="my-exp", - ) - - service.launch_job(job_config=job_config) - - emitted = " ".join(call.args[0] for call in mock_logger_info.call_args_list) - # Per-job URL keyed on the job's numeric ID. 
- self.assertIn( - "experiments+9876", - emitted, - ) - # Cross-job URL keyed on the user-supplied experiment name. - self.assertIn( - "experiments+my-exp", - emitted, - ) - - @patch("gigl.common.services.vertex_ai.logger.info") - @patch("gigl.common.services.vertex_ai.aiplatform.CustomJob") - @patch("gigl.common.services.vertex_ai.aiplatform.init") - def test_logs_only_per_job_url_when_no_experiment_name( - self, - mock_aiplatform_init: Mock, - mock_custom_job_class: Mock, - mock_logger_info: Mock, - ) -> None: - mock_job = Mock() - mock_job.resource_name = "projects/p/locations/us-central1/customJobs/9876" - mock_job.name = "9876" - mock_custom_job_class.return_value = mock_job - - service = VertexAIService( - project="p", - location="us-central1", - service_account="svc@p.iam.gserviceaccount.com", - staging_bucket="gs://staging", - ) - job_config = VertexAiJobConfig( - job_name="my-job", - container_uri="gcr.io/p/img", - command=["python", "-m", "trainer"], - base_output_dir="gs://staging/my-job/trainer", - tensorboard_resource_name="projects/p/locations/us-central1/tensorboards/42", - ) - - service.launch_job(job_config=job_config) - - emitted = " ".join(call.args[0] for call in mock_logger_info.call_args_list) - self.assertIn("experiments+9876", emitted) - self.assertNotIn("cross-job comparison", emitted) - - -if __name__ == "__main__": - absltest.main() diff --git a/tests/unit/src/training/glt_trainer_test.py b/tests/unit/src/training/glt_trainer_test.py deleted file mode 100644 index 1a6d246ff..000000000 --- a/tests/unit/src/training/glt_trainer_test.py +++ /dev/null @@ -1,111 +0,0 @@ -"""Unit tests for GLTTrainer dispatch. - -The trainer no longer extracts ``tensorboard_experiment_name`` from -``GbmlConfig``; that field now lives on ``VertexAiResourceConfig`` and the -launcher reads it directly. These tests confirm the trainer dispatches to -the right launcher based on ``trainer_config`` type. 
-""" - -from unittest.mock import MagicMock, patch - -from gigl.common import UriFactory -from gigl.src.common.types import AppliedTaskIdentifier -from gigl.src.training.v2.glt_trainer import GLTTrainer -from snapchat.research.gbml import gbml_config_pb2, gigl_resource_config_pb2 -from tests.test_assets.test_case import TestCase - - -def _make_resource_config_wrapper_with_single_pool() -> MagicMock: - """Return a GiglResourceConfigWrapper mock backed by a VertexAiResourceConfig.""" - vertex_ai_config = gigl_resource_config_pb2.VertexAiResourceConfig( - machine_type="n1-standard-8", - num_replicas=1, - timeout=7200, - ) - mock_wrapper = MagicMock() - mock_wrapper.trainer_config = vertex_ai_config - mock_wrapper.vertex_ai_trainer_region = "us-central1" - return mock_wrapper - - -def _make_resource_config_wrapper_with_graph_store() -> MagicMock: - """Return a GiglResourceConfigWrapper mock backed by a VertexAiGraphStoreConfig.""" - compute_pool = gigl_resource_config_pb2.VertexAiResourceConfig( - machine_type="n1-standard-16", - num_replicas=1, - ) - storage_pool = gigl_resource_config_pb2.VertexAiResourceConfig( - machine_type="n1-highmem-32", - num_replicas=1, - ) - graph_store_config = gigl_resource_config_pb2.VertexAiGraphStoreConfig( - compute_pool=compute_pool, - graph_store_pool=storage_pool, - compute_cluster_local_world_size=4, - ) - mock_wrapper = MagicMock() - mock_wrapper.trainer_config = graph_store_config - return mock_wrapper - - -def _make_gbml_config_pb_wrapper() -> MagicMock: - """Return a minimal GbmlConfigPbWrapper mock for trainer dispatch.""" - trainer_config_proto = gbml_config_pb2.GbmlConfig.TrainerConfig( - command="python -m gigl.src.training.v2.glt_trainer", - ) - mock_wrapper = MagicMock() - mock_wrapper.trainer_config = trainer_config_proto - mock_wrapper.shared_config.trained_model_metadata.tensorboard_logs_uri = "" - return mock_wrapper - - -class TestGltTrainerDispatch(TestCase): - """Tests that GLTTrainer dispatches to the correct 
launcher entry point.""" - - @patch("gigl.src.training.v2.glt_trainer.launch_single_pool_job") - @patch("gigl.src.training.v2.glt_trainer.GbmlConfigPbWrapper") - @patch("gigl.src.training.v2.glt_trainer.get_resource_config") - def test_single_pool_resource_config_dispatches_to_single_pool_launcher( - self, - mock_get_resource_config: MagicMock, - mock_gbml_config_cls: MagicMock, - mock_launch_single_pool_job: MagicMock, - ) -> None: - mock_get_resource_config.return_value = ( - _make_resource_config_wrapper_with_single_pool() - ) - mock_gbml_config_cls.get_gbml_config_pb_wrapper_from_uri.return_value = ( - _make_gbml_config_pb_wrapper() - ) - - GLTTrainer().run( - applied_task_identifier=AppliedTaskIdentifier("test-job"), - task_config_uri=UriFactory.create_uri("gs://bucket/task.yaml"), - resource_config_uri=UriFactory.create_uri("gs://bucket/resource.yaml"), - ) - - mock_launch_single_pool_job.assert_called_once() - - @patch("gigl.src.training.v2.glt_trainer.launch_graph_store_enabled_job") - @patch("gigl.src.training.v2.glt_trainer.GbmlConfigPbWrapper") - @patch("gigl.src.training.v2.glt_trainer.get_resource_config") - def test_graph_store_resource_config_dispatches_to_graph_store_launcher( - self, - mock_get_resource_config: MagicMock, - mock_gbml_config_cls: MagicMock, - mock_launch_graph_store_enabled_job: MagicMock, - ) -> None: - mock_get_resource_config.return_value = ( - _make_resource_config_wrapper_with_graph_store() - ) - mock_gbml_config_cls.get_gbml_config_pb_wrapper_from_uri.return_value = ( - _make_gbml_config_pb_wrapper() - ) - - GLTTrainer().run( - applied_task_identifier=AppliedTaskIdentifier("test-job"), - task_config_uri=UriFactory.create_uri("gs://bucket/task.yaml"), - resource_config_uri=UriFactory.create_uri("gs://bucket/resource.yaml"), - ) - - mock_launch_graph_store_enabled_job.assert_called_once() From 8d8c9b83b7785150befd6ed6e7ee5286eb666c25 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 6 May 2026 16:07:19 +0000 Subject: [PATCH 
54/59] docs: plan to drop submit(tensorboard=) and collapse to single uploader Stacked on top of PR #603. Plan removes the dual-uploader design once PR #603 lands: keeps the chief-rank ``aiplatform.start_upload_tb_log`` streamer, drops Vertex's auto-uploader (and with it the per-job "Open TensorBoard" button on the Vertex UI). Trainer stdout will log the named-experiment URL to compensate. Step 0's constraint check (does AIP_TENSORBOARD_LOG_DIR survive without submit(tensorboard=)) is resolved against Vertex's training-code-requirements docs: yes, the env var is tied to baseOutputDirectory alone. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../plans/20260506-drop-submit-tensorboard.md | 178 ++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 docs/plans/20260506-drop-submit-tensorboard.md diff --git a/docs/plans/20260506-drop-submit-tensorboard.md b/docs/plans/20260506-drop-submit-tensorboard.md new file mode 100644 index 000000000..c01605f95 --- /dev/null +++ b/docs/plans/20260506-drop-submit-tensorboard.md @@ -0,0 +1,178 @@ +# Drop `submit(tensorboard=...)`, single-uploader TB + +Date: 2026-05-06 +Predecessor PR: https://github.com/Snapchat/GiGL/pull/603 + +## Goal + +Eliminate Vertex AI's auto-uploader. Keep only the chief-rank +`aiplatform.start_upload_tb_log` uploader for both live streaming and multi-run +comparison. Single uploader, single experiment, less plumbing. + +## Why + +PR #603 ships a dual-uploader design: Vertex's auto-uploader (gated on +`submit(tensorboard=...)`) plus a chief-rank `start_upload_tb_log` uploader. +That's because the SDK forces `submit(tensorboard=)` and `submit(experiment=)` +to be mutually exclusive, so getting both R1 (per-job UI link) and R2 (multi-run +comparison) required two parallel uploaders streaming from the same log dir. 
+
+We want to keep streaming and multi-run comparison, but we don't actually need
+R1 (the "Open TensorBoard" button on the Vertex job page) — we can replace it
+with a logged URL in trainer stdout. Dropping `submit(tensorboard=...)` removes
+the dual-uploader oddity and most of the supporting plumbing in
+`vertex_ai.py`.
+
+## Step 0 — Constraint check (resolved via docs)
+
+**Question:** does Vertex AI populate `AIP_TENSORBOARD_LOG_DIR` inside the
+worker container when `baseOutputDirectory` is set on `CustomJobSpec` but
+`submit(tensorboard=)` is NOT?
+
+**Answer: yes.** Vertex's training-code-requirements doc
+(https://cloud.google.com/vertex-ai/docs/training/code-requirements) is
+unambiguous: when `baseOutputDirectory` is configured, Vertex AI sets
+`AIP_MODEL_DIR`, `AIP_CHECKPOINT_DIR`, and `AIP_TENSORBOARD_LOG_DIR` env vars
+unconditionally. The `tensorboard` field on `CustomJobSpec` is not a
+prerequisite. **The env-var fallback step this question gated on is not required** and is dropped from this plan.
+
+(If smoke testing later reveals a discrepancy, that fallback step can be
+re-introduced.)
+
+## Step 1 — Tighten validation: both fields or neither
+
+**File:** `gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py`
+
+In `check_vertex_ai_trainer_tensorboard_compatibility`, replace the current
+"experiment name requires resource name" rule with "both must be set together
+(or both unset)":
+
+- If exactly one of `tensorboard_resource_name` /
+  `tensorboard_experiment_name` is set, raise.
+- Add the Vertex resource-ID regex check on `tensorboard_experiment_name` here
+  (moved from `_submit_job`).
+
+This shifts the precondition out of submit-time into the validation-check
+stage, where the rest of the resource-config rules live.
+
+**Backwards compat:** zero risk. Both proto fields landed in PR #603 (this
+branch); neither exists on `main`. No production config has
+`tensorboard_resource_name` set without `tensorboard_experiment_name`. 
+ +**File:** `tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py` +- Add a "resource_name set, experiment_name unset" failure test. +- Existing "experiment_name set, resource_name unset" failure test stays. +- Add a regex-failure test for an invalid experiment name. + +## Step 2 — Drop the `submit(tensorboard=...)` path + +**File:** `gigl/common/services/vertex_ai.py` + +In `_submit_job` (around lines 411-440): +- Delete `tensorboard=job_config.tensorboard_resource_name or None` kwarg from + `job.submit(...)`. +- Delete the experiment-name regex precondition block (lines 411-424); moved + to validation in Step 1. + +URL logging (lines 450-470): +- Delete the per-job URL log (lines 450-459). +- Keep the cross-job URL log (lines 460-470). Validation now guarantees both + names are present whenever either is, so the inner `if` simplifies. + +`VertexAiJobConfig` (around lines 213-214): +- Delete `tensorboard_resource_name` and `tensorboard_experiment_name` fields. + They were carriers from launcher into `_submit_job`; nothing reads them now. + +`_VERTEX_RESOURCE_ID_PATTERN` constant: delete from this file (only used by +validation now, which has its own copy or imports it). + +## Step 3 — Stop wiring TB names into VertexAiJobConfig (launcher) + +**File:** `gigl/src/common/vertex_ai_launcher.py` + +- `_build_job_config` (around lines 405-412): drop `tensorboard_resource_name=...` + and `tensorboard_experiment_name=...` kwargs to `VertexAiJobConfig`. +- Env-var injection block (lines 339-369): keep. The "both set" guard at line + 357-358 simplifies — since validation now enforces all-or-nothing, it's + exactly one condition (either field set implies both). +- `baseOutputDirectory` plumbing: unchanged. 
+ +## Step 4 — Surface the named-experiment URL where users will see it + +**File:** `gigl/utils/tensorboard_writer.py` + +In `_maybe_start_uploader`, after `aiplatform.start_upload_tb_log(...)` +succeeds, log the cross-job experiment URL using the same format as +`vertex_ai.py:_build_tensorboard_experiment_url`. Either move that helper to a +shared location (`gigl/common/services/vertex_ai_url_helpers.py` or similar) +or inline the format string in the writer — it's three lines, duplication is +fine. + +Compensates for losing the Vertex UI's "Open TensorBoard" button by putting +the link in trainer stdout, where engineers already look. + +## Step 5 — Tests + +**File:** `tests/unit/src/common/vertex_ai_launcher_test.py` +- Drop assertions on `cfg.tensorboard_resource_name` / + `cfg.tensorboard_experiment_name` (the dataclass fields are gone). Env-var + injection assertions stay and become the primary contract test. + +**File:** `tests/unit/utils/tensorboard_writer_test.py` +- Add coverage for the URL log line emitted by `_maybe_start_uploader` on + success (Step 4). + +## Step 6 — Verification + +- `make type_check` clean. +- Per-file: `make unit_test_py PY_TEST_FILES="vertex_ai_launcher_test.py"`, + `tensorboard_writer_test.py`, + `gbml_and_resource_config_compatibility_checks_test.py`. +- Smoke: rerun the same two-runs-on-one-experiment smoke from PR #603. Confirm: + - Vertex job page no longer renders a TB button (expected regression). + - Trainer stdout logs the named-experiment URL. + - Both runs land on the same TB page side-by-side. + - `printenv | grep AIP_` confirms `AIP_TENSORBOARD_LOG_DIR` is set even + without `submit(tensorboard=)` (sanity check on the Step 0 doc claim). +- Full e2e CORA pipeline regression. + +## Risk and rollback + +- **Step 0's claim is load-bearing.** Resolved via docs, but the smoke run in + Step 6 should cross-check `AIP_TENSORBOARD_LOG_DIR` actually appears in the + worker container before relying on it in production. 
+- **UX regression on the Vertex UI button.** Mitigated by Step 4's stdout + logging. Call out in the PR description so reviewers aren't surprised. +- **Rollback:** single PR, easy to revert. Proto is unchanged; both fields + stay as carriers for the chief-rank uploader. Reverting just adds back the + `submit(tensorboard=...)` kwarg and the dropped `VertexAiJobConfig` fields. + +## Critical files + +- `gigl/common/services/vertex_ai.py` — drop submit kwarg, drop dataclass + fields, drop URL helpers (Step 2). +- `gigl/src/common/vertex_ai_launcher.py` — drop dataclass kwargs (Step 3). +- `gigl/utils/tensorboard_writer.py` — surface URL on uploader start (Step 4). +- `gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py` + — tighten to all-or-nothing + regex check (Step 1). +- Tests under `tests/unit/src/common/`, `tests/unit/utils/`, + `tests/unit/src/validation/lib/`. + +## Out of scope + +- Structured "trainer output metadata" file for KFP UI surfacing of the TB + URL. Considered useful but separate; defer. +- Removing `tensorboard_resource_name` field entirely. The chief-rank uploader + needs it (it's how `start_upload_tb_log` knows which `Tensorboard` instance + to write to), so the field stays. 
+ +## References + +- Vertex AI training code requirements (env vars): + https://cloud.google.com/vertex-ai/docs/training/code-requirements +- `CustomJobSpec` REST (`baseOutputDirectory`): + https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec +- TB data model: + https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview +- `aiplatform.start_upload_tb_log`: + https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform#google_cloud_aiplatform_start_upload_tb_log From fe835b3b3b250fa2bd2641c2933cdf14f24c6892 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 6 May 2026 16:11:10 +0000 Subject: [PATCH 55/59] validation: require tensorboard_resource_name and experiment_name together The chief-rank uploader needs both fields to call ``aiplatform.start_upload_tb_log``; setting only one is now caught at config-validation time instead of producing a silent no-op (or a submit-time crash). Also moves the Vertex AI Experiment-ID regex check from ``vertex_ai.py:_submit_job`` into the validation pass so the precondition fires earlier in the pipeline. Existing example configs that set ``tensorboard_resource_name`` only are updated to also set ``tensorboard_experiment_name``. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../configs/example_resource_config.yaml | 1 + .../configs/example_resource_config.yaml | 1 + ...nd_resource_config_compatibility_checks.py | 51 ++++++++++++++----- ...source_config_compatibility_checks_test.py | 41 +++++++++++++-- 4 files changed, 79 insertions(+), 15 deletions(-) diff --git a/examples/link_prediction/configs/example_resource_config.yaml b/examples/link_prediction/configs/example_resource_config.yaml index b24557f9a..0d8531215 100644 --- a/examples/link_prediction/configs/example_resource_config.yaml +++ b/examples/link_prediction/configs/example_resource_config.yaml @@ -44,6 +44,7 @@ trainer_resource_config: gpu_limit: 2 num_replicas: 2 tensorboard_resource_name: "projects/USER_PROVIDED_PROJECT/locations/us-central1/tensorboards/USER_PROVIDED_TENSORBOARD_ID" + tensorboard_experiment_name: "USER_PROVIDED_EXPERIMENT_NAME" inferencer_resource_config: vertex_ai_inferencer_config: machine_type: n1-standard-16 diff --git a/examples/link_prediction/graph_store/configs/example_resource_config.yaml b/examples/link_prediction/graph_store/configs/example_resource_config.yaml index a06f3192a..68929311e 100644 --- a/examples/link_prediction/graph_store/configs/example_resource_config.yaml +++ b/examples/link_prediction/graph_store/configs/example_resource_config.yaml @@ -59,6 +59,7 @@ trainer_resource_config: gpu_limit: 2 num_replicas: 2 tensorboard_resource_name: "projects/USER_PROVIDED_PROJECT/locations/us-central1/tensorboards/USER_PROVIDED_TENSORBOARD_ID" + tensorboard_experiment_name: "USER_PROVIDED_EXPERIMENT_NAME" inferencer_resource_config: vertex_ai_graph_store_inferencer_config: graph_store_pool: diff --git a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py index 2703915d1..e79b2a2a5 100644 --- a/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py +++ 
b/gigl/src/validation_check/libs/gbml_and_resource_config_compatibility_checks.py @@ -5,7 +5,8 @@ If graph store mode is set up for trainer or inferencer in one config, it must be set up in the other. """ -from typing import Literal +import re +from typing import Final, Literal from google.protobuf.message import Message @@ -18,6 +19,12 @@ logger = Logger() +# Vertex AI Experiment IDs are MetadataStore Context IDs and must satisfy +# this regex. +_VERTEX_RESOURCE_ID_PATTERN: Final[re.Pattern[str]] = re.compile( + r"^[a-z0-9][a-z0-9-]{0,127}$" +) + def _gbml_config_has_graph_store( gbml_config_pb_wrapper: GbmlConfigPbWrapper, @@ -108,13 +115,21 @@ def check_vertex_ai_trainer_tensorboard_compatibility( ) -> None: """Check that Vertex AI trainer TensorBoard config is complete. + ``tensorboard_resource_name`` and ``tensorboard_experiment_name`` must be + supplied together (or both unset). The trainer's chief-rank uploader needs + both to call ``aiplatform.start_upload_tb_log``; setting only one + produces no observable behavior. + Args: gbml_config_pb_wrapper: The GbmlConfig wrapper. resource_config_wrapper: The GiglResourceConfig wrapper. Raises: - AssertionError: If TensorBoard logging is enabled for a Vertex AI - trainer but no TensorBoard resource name is configured. + AssertionError: If exactly one of ``tensorboard_resource_name`` / + ``tensorboard_experiment_name`` is set, or if + ``tensorboard_experiment_name`` doesn't satisfy the Vertex AI + resource-ID format, or if ``should_log_to_tensorboard`` is set + without both TB fields. """ logger.info( "Config validation check: Vertex AI trainer TensorBoard compatibility between template and resource configs." 
@@ -134,21 +149,33 @@ def check_vertex_ai_trainer_tensorboard_compatibility( else: return - if vertex_ai_config.tensorboard_experiment_name: - assert vertex_ai_config.tensorboard_resource_name, ( - "VertexAiResourceConfig.tensorboard_experiment_name is set " - f"({vertex_ai_config.tensorboard_experiment_name!r}) but no " - "Vertex AI TensorBoard resource is configured; the experiment " - "needs a backing TB resource." + has_resource_name = bool(vertex_ai_config.tensorboard_resource_name) + has_experiment_name = bool(vertex_ai_config.tensorboard_experiment_name) + if has_resource_name != has_experiment_name: + raise AssertionError( + "VertexAiResourceConfig.tensorboard_resource_name and " + "tensorboard_experiment_name must be set together. " + f"tensorboard_resource_name set: {has_resource_name}, " + f"tensorboard_experiment_name set: {has_experiment_name}." + ) + + if has_experiment_name and not _VERTEX_RESOURCE_ID_PATTERN.match( + vertex_ai_config.tensorboard_experiment_name + ): + raise AssertionError( + "VertexAiResourceConfig.tensorboard_experiment_name " + f"({vertex_ai_config.tensorboard_experiment_name!r}) is not a " + f"valid Vertex AI Experiment ID; it must match " + f"{_VERTEX_RESOURCE_ID_PATTERN.pattern}." ) if not gbml_config_pb_wrapper.trainer_config.should_log_to_tensorboard: return - assert vertex_ai_config.tensorboard_resource_name, ( + assert has_resource_name, ( "GbmlConfig.trainer_config.should_log_to_tensorboard is true, so a " - "Vertex AI TensorBoard resource name must be set in the trainer " - "resource config." + "Vertex AI TensorBoard resource name and experiment name must be " + "set in the trainer resource config." 
) diff --git a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py index 609c47676..716a8e270 100644 --- a/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py +++ b/tests/unit/src/validation/lib/gbml_and_resource_config_compatibility_checks_test.py @@ -284,7 +284,8 @@ def test_vertex_ai_trainer_tensorboard_config_present(self): resource_config = _create_resource_config_with_trainer_tensorboard( tensorboard_resource_name=( "projects/test-project/locations/us-central1/tensorboards/test" - ) + ), + tensorboard_experiment_name="my-comparison", ) check_vertex_ai_trainer_tensorboard_compatibility( @@ -298,6 +299,7 @@ def test_graph_store_trainer_tensorboard_config_present(self): tensorboard_resource_name=( "projects/test-project/locations/us-central1/tensorboards/test" ), + tensorboard_experiment_name="my-comparison", use_graph_store=True, ) @@ -316,6 +318,39 @@ def test_vertex_ai_trainer_tensorboard_missing_resource_name_raises(self): resource_config_wrapper=resource_config, ) + def test_resource_name_set_without_experiment_name_raises(self): + """tensorboard_resource_name set without tensorboard_experiment_name → AssertionError.""" + gbml_config = _create_empty_gbml_config() + resource_config = _create_resource_config_with_trainer_tensorboard( + tensorboard_resource_name=( + "projects/test-project/locations/us-central1/tensorboards/test" + ) + ) + + with self.assertRaises(AssertionError) as ctx: + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + self.assertIn("must be set together", str(ctx.exception)) + + def test_invalid_experiment_name_format_raises(self): + """tensorboard_experiment_name that violates the Vertex resource-ID regex raises.""" + gbml_config = _create_empty_gbml_config() + resource_config = 
_create_resource_config_with_trainer_tensorboard( + tensorboard_resource_name=( + "projects/test-project/locations/us-central1/tensorboards/test" + ), + tensorboard_experiment_name="My_Invalid_Name", + ) + + with self.assertRaises(AssertionError) as ctx: + check_vertex_ai_trainer_tensorboard_compatibility( + gbml_config_pb_wrapper=gbml_config, + resource_config_wrapper=resource_config, + ) + self.assertIn("not a valid Vertex AI Experiment ID", str(ctx.exception)) + def test_resource_has_inferencer_graph_store_template_does_not(self): """Test that resource having graph store but template not raises an assertion error.""" gbml_config = _create_gbml_config_without_graph_stores() @@ -327,7 +362,7 @@ def test_resource_has_inferencer_graph_store_template_does_not(self): ) def test_experiment_name_set_without_tensorboard_resource_raises(self): - """tensorboard_experiment_name set but no TB resource → AssertionError mentioning the field.""" + """tensorboard_experiment_name set without resource_name → AssertionError.""" gbml_config = _create_empty_gbml_config() resource_config = _create_resource_config_with_experiment_name_only( experiment_name="my-comparison" @@ -338,7 +373,7 @@ def test_experiment_name_set_without_tensorboard_resource_raises(self): gbml_config_pb_wrapper=gbml_config, resource_config_wrapper=resource_config, ) - self.assertIn("tensorboard_experiment_name", str(ctx.exception)) + self.assertIn("must be set together", str(ctx.exception)) def test_experiment_name_set_with_tensorboard_resource_does_not_raise(self): """tensorboard_experiment_name set and TB resource present → no exception.""" From c723261b2770df9505f4e7ee360f93cb867849c6 Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 6 May 2026 16:19:19 +0000 Subject: [PATCH 56/59] vertex_ai: drop submit(tensorboard=) and the dual-uploader design The chief-rank ``aiplatform.start_upload_tb_log`` uploader started in PR #603 already gives both R1 (live streaming) and R2 (multi-run comparison via a stable 
``TensorboardExperiment``). Vertex's built-in auto-uploader (gated on ``submit(tensorboard=...)``) was a duplicate path that wrote to a separate per-job experiment named after the job ID. Dropping it collapses to a single uploader. Trade-off: the "Open TensorBoard" button on the Vertex AI job page no longer renders. The named-experiment URL is still logged via the existing trainer-side path; a follow-up step will surface it more prominently in trainer stdout. This drops: - ``tensorboard=...`` kwarg from ``job.submit(...)``. - ``VertexAiJobConfig.tensorboard_resource_name`` and ``tensorboard_experiment_name`` dataclass fields (they were carriers into ``_submit_job``; nothing reads them now). - The ``_VERTEX_RESOURCE_ID_PATTERN`` regex check in ``_submit_job`` (moved to validation in the previous commit). - The per-job and cross-job URL log lines in ``_submit_job``. - The launcher's ``tensorboard_resource_name=`` / ``tensorboard_experiment_name=`` kwargs to ``VertexAiJobConfig``. The ``GIGL_TENSORBOARD_*`` env-var injection block is unchanged (simplified guard since validation now enforces all-or-nothing). Co-Authored-By: Claude Opus 4.7 (1M context) --- gigl/common/services/vertex_ai.py | 71 ++----------------- gigl/src/common/vertex_ai_launcher.py | 34 +++------ .../src/common/vertex_ai_launcher_test.py | 65 +++-------------- 3 files changed, 26 insertions(+), 144 deletions(-) diff --git a/gigl/common/services/vertex_ai.py b/gigl/common/services/vertex_ai.py index 6d3d865ad..7350f5ec3 100644 --- a/gigl/common/services/vertex_ai.py +++ b/gigl/common/services/vertex_ai.py @@ -88,12 +88,6 @@ def get_pipeline() -> int: # NOTE: `get_pipeline` here is the Pipeline name DEFAULT_PIPELINE_TIMEOUT_S: Final[int] = 60 * 60 * 36 # 36 hours DEFAULT_CUSTOM_JOB_TIMEOUT_S: Final[int] = 60 * 60 * 24 # 24 hours -# Vertex AI Experiment IDs are MetadataStore Context IDs and must satisfy -# this regex. 
-_VERTEX_RESOURCE_ID_PATTERN: Final[re.Pattern[str]] = re.compile( - r"^[a-z0-9][a-z0-9-]{0,127}$" -) - # Captures the trailing tensorboard ID from a fully-qualified resource name. # Used only for building the human-readable TB UI URL. _VERTEX_TENSORBOARD_ID_FROM_RESOURCE_PATTERN: Final[re.Pattern[str]] = re.compile( @@ -179,18 +173,11 @@ class VertexAiJobConfig: AI default (no reservation). base_output_dir: Optional CustomJob base output directory. When set, Vertex AI derives ``AIP_MODEL_DIR``, ``AIP_CHECKPOINT_DIR``, and - ``AIP_TENSORBOARD_LOG_DIR`` from this directory. - tensorboard_resource_name: Optional existing Vertex AI TensorBoard - resource to attach to the job. - tensorboard_experiment_name: Optional Vertex AI ``TensorboardExperiment`` - name for cross-job comparison. When set, the launcher injects - ``GIGL_TENSORBOARD_*`` env vars into the worker container; the - trainer's chief rank then streams events to this experiment via - ``aiplatform.start_upload_tb_log`` *in addition to* Vertex's - built-in per-job auto-upload (which is gated on - ``tensorboard_resource_name`` and is what the "Open TensorBoard" - link on the VAI job page resolves to). Multiple jobs sharing this - name appear as comparable runs on a single TensorBoard page. + ``AIP_TENSORBOARD_LOG_DIR`` from this directory. Setting this is + how GiGL trainers learn where to write TensorBoard events; the + chief-rank uploader (started inside the trainer) is what streams + them to a Vertex AI ``TensorboardExperiment`` for cross-job + comparison. 
""" job_name: str @@ -210,8 +197,6 @@ class VertexAiJobConfig: scheduling_strategy: Optional[aiplatform.gapic.Scheduling.Strategy] = None reservation_affinity: Optional[ReservationAffinity] = None base_output_dir: Optional[str] = None - tensorboard_resource_name: Optional[str] = None - tensorboard_experiment_name: Optional[str] = None class VertexAIService: @@ -408,36 +393,11 @@ def _submit_job( staging_bucket=self._staging_bucket, base_output_dir=job_config.base_output_dir, ) - if job_config.tensorboard_experiment_name: - if not job_config.tensorboard_resource_name: - raise ValueError( - "tensorboard_experiment_name is set but tensorboard_resource_name " - "is not; the experiment needs a backing TB resource." - ) - if not _VERTEX_RESOURCE_ID_PATTERN.match( - job_config.tensorboard_experiment_name - ): - raise ValueError( - f"tensorboard_experiment_name {job_config.tensorboard_experiment_name!r} " - f"is not a valid Vertex AI Experiment ID; it must match " - f"{_VERTEX_RESOURCE_ID_PATTERN.pattern}." - ) - - # Always pass ``tensorboard=`` when a TB resource is - # configured so the Vertex AI job page shows an "Open TensorBoard" - # link to the auto-named per-job experiment. When - # ``tensorboard_experiment_name`` is also set, the launcher injects - # ``GIGL_TENSORBOARD_*`` env vars and the trainer's chief rank - # additionally streams events to the user-named experiment via - # ``aiplatform.start_upload_tb_log``. See - # https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-training - # for Vertex's auto-uploader contract. 
job.submit( service_account=self._service_account, timeout=job_config.timeout_s, enable_web_access=job_config.enable_web_access, scheduling_strategy=job_config.scheduling_strategy, - tensorboard=job_config.tensorboard_resource_name or None, ) job.wait_for_resource_creation() logger.info(f"Created job: {job.resource_name}") @@ -447,27 +407,6 @@ def _submit_job( logger.info( f"See job logs at: https://console.cloud.google.com/ai/platform/locations/{self._location}/training/{job.name}?project={self._project}" ) - if job_config.tensorboard_resource_name: - # Per-job TensorboardExperiment: name == job's numeric ID, set by - # Vertex's auto-uploader. This is what the "Open TensorBoard" link - # on the VAI job page resolves to. - per_job_url = _build_tensorboard_experiment_url( - tensorboard_resource_name=job_config.tensorboard_resource_name, - experiment_id=job.name, - ) - if per_job_url: - logger.info(f"View TensorBoard (per-job): {per_job_url}") - if job_config.tensorboard_experiment_name: - comparison_url = _build_tensorboard_experiment_url( - tensorboard_resource_name=job_config.tensorboard_resource_name, - experiment_id=job_config.tensorboard_experiment_name, - ) - if comparison_url: - logger.info( - "View TensorBoard (cross-job comparison, " - f"experiment={job_config.tensorboard_experiment_name!r}): " - f"{comparison_url}" - ) job.wait_for_completion() return job diff --git a/gigl/src/common/vertex_ai_launcher.py b/gigl/src/common/vertex_ai_launcher.py index 944a41a6d..bc9734b55 100644 --- a/gigl/src/common/vertex_ai_launcher.py +++ b/gigl/src/common/vertex_ai_launcher.py @@ -313,9 +313,6 @@ def _build_job_config( Returns: VertexAiJobConfig: A configuration object ready to be used with VertexAIService.launch_job(). 
""" - tensorboard_experiment_name = ( - vertex_ai_resource_config.tensorboard_experiment_name or None - ) job_args = ( [ f"--job_name={job_name}", @@ -335,29 +332,24 @@ def _build_job_config( else None ) - # When the user opted into a stable Vertex AI TensorboardExperiment via - # ``tensorboard_experiment_name``, inject env vars into the worker so the - # chief-rank trainer can stream events directly to that experiment via - # ``aiplatform.start_upload_tb_log``. (Vertex's built-in auto-uploader - # still runs in parallel — see ``VertexAIService._submit_job`` — and - # writes to a per-job auto-named experiment so the "Open TensorBoard" - # link on the VAI job page resolves correctly.) + # When the user opted into a stable Vertex AI TensorboardExperiment, inject + # env vars into the worker so the chief-rank trainer can stream events + # directly to that experiment via ``aiplatform.start_upload_tb_log``. + # Validation guarantees ``tensorboard_resource_name`` and + # ``tensorboard_experiment_name`` are set together. # # ``GIGL_TENSORBOARD_RUN_NAME`` carries a launch-unique, sanitized run # name. The writer creates a subdirectory of ``AIP_TENSORBOARD_LOG_DIR`` # with this name; the SDK ``LogdirLoader`` then surfaces it as a distinct - # ``TensorboardRun`` in the named experiment, so two jobs sharing - # ``tensorboard_experiment_name`` show up as two runs (instead of merging - # into one ``default`` run). + # ``TensorboardRun`` in the named experiment, so two jobs sharing the + # experiment name show up as two runs (instead of merging into one + # ``default`` run). 
# # References: # https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-overview # https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec container_env_vars = list(env_vars) - if ( - tensorboard_experiment_name - and vertex_ai_resource_config.tensorboard_resource_name - ): + if vertex_ai_resource_config.tensorboard_experiment_name: container_env_vars.extend( [ env_var.EnvVar( @@ -366,7 +358,7 @@ def _build_job_config( ), env_var.EnvVar( name="GIGL_TENSORBOARD_EXPERIMENT_NAME", - value=tensorboard_experiment_name, + value=vertex_ai_resource_config.tensorboard_experiment_name, ), env_var.EnvVar( name="GIGL_TENSORBOARD_RUN_NAME", @@ -404,12 +396,6 @@ def _build_job_config( vertex_ai_resource_config.reservation_affinity ), base_output_dir=base_output_dir, - tensorboard_resource_name=( - vertex_ai_resource_config.tensorboard_resource_name or None - if base_output_dir is not None - else None - ), - tensorboard_experiment_name=tensorboard_experiment_name, ) return job_config diff --git a/tests/unit/src/common/vertex_ai_launcher_test.py b/tests/unit/src/common/vertex_ai_launcher_test.py index 0a566b4c3..084fbb281 100644 --- a/tests/unit/src/common/vertex_ai_launcher_test.py +++ b/tests/unit/src/common/vertex_ai_launcher_test.py @@ -201,10 +201,6 @@ def test_launch_training_graph_store_cuda(self, mock_vertex_ai_service_class): compute_job_config.base_output_dir, "gs://test-perm-bucket/job-name/trainer", ) - self.assertEqual( - compute_job_config.tensorboard_resource_name, - compute_pool.tensorboard_resource_name, - ) # Verify storage pool config self.assertEqual(storage_job_config.machine_type, storage_pool.machine_type) @@ -215,7 +211,6 @@ def test_launch_training_graph_store_cuda(self, mock_vertex_ai_service_class): self.assertIsNotNone(storage_job_config.args) assert storage_job_config.args is not None # Type narrowing for mypy self.assertIsNone(storage_job_config.base_output_dir) - 
self.assertIsNone(storage_job_config.tensorboard_resource_name) # Verify environment variables compute_env_vars = { @@ -322,7 +317,6 @@ def test_launch_inference_single_pool_cpu(self, mock_vertex_ai_service_class): f"--output_path={process_runtime_args['output_path']}", job_config.args ) self.assertIsNone(job_config.base_output_dir) - self.assertIsNone(job_config.tensorboard_resource_name) # Verify resource labels expected_labels = { @@ -371,7 +365,8 @@ def test_launch_single_pool_job_reads_experiment_name_from_resource_config( mock_service_instance.launch_job.assert_called_once() call_args = mock_service_instance.launch_job.call_args job_config = call_args.kwargs["job_config"] - self.assertEqual(job_config.tensorboard_experiment_name, experiment_name) + env = {ev.name: ev.value for ev in job_config.environment_variables or []} + self.assertEqual(env.get("GIGL_TENSORBOARD_EXPERIMENT_NAME"), experiment_name) @patch("gigl.src.common.vertex_ai_launcher.VertexAIService") def test_launch_graph_store_job_reads_experiment_name_from_compute_pool( @@ -415,55 +410,17 @@ def test_launch_graph_store_job_reads_experiment_name_from_compute_pool( compute_job_config = call_args.kwargs["compute_pool_job_config"] storage_job_config = call_args.kwargs["storage_pool_job_config"] + compute_env = { + ev.name: ev.value + for ev in compute_job_config.environment_variables or [] + } + storage_env_names = { + ev.name for ev in storage_job_config.environment_variables or [] + } self.assertEqual( - compute_job_config.tensorboard_experiment_name, experiment_name - ) - self.assertIsNone(storage_job_config.tensorboard_experiment_name) - - def test_build_job_config_threads_experiment_name(self) -> None: - """tensorboard_experiment_name on the resource config flows to VertexAiJobConfig.""" - resource_config = gigl_resource_config_pb2.VertexAiResourceConfig( - machine_type="n1-standard-4", - gpu_type="ACCELERATOR_TYPE_UNSPECIFIED", - gpu_limit=0, - num_replicas=1, - 
tensorboard_resource_name="projects/p/locations/us/tensorboards/1", - tensorboard_experiment_name="my-comparison", - ) - cfg = _build_job_config( - job_name="job", - task_config_uri=Uri("gs://b/task.yaml"), - resource_config_uri=Uri("gs://b/resource.yaml"), - command_str="python -m gigl.src.training.v2.glt_trainer", - args={}, - use_cuda=False, - container_uri="gcr.io/p/img", - vertex_ai_resource_config=resource_config, - env_vars=[], - tensorboard_logs_uri=Uri("gs://b/run/logs/"), - ) - self.assertEqual(cfg.tensorboard_experiment_name, "my-comparison") - - def test_build_job_config_experiment_name_default(self) -> None: - """Test that tensorboard_experiment_name defaults to None/empty when not provided.""" - resource_config = gigl_resource_config_pb2.VertexAiResourceConfig( - machine_type="n1-standard-4", - gpu_type="ACCELERATOR_TYPE_UNSPECIFIED", - gpu_limit=0, - num_replicas=1, - ) - cfg = _build_job_config( - job_name="job", - task_config_uri=Uri("gs://b/task.yaml"), - resource_config_uri=Uri("gs://b/resource.yaml"), - command_str="python -m gigl.src.training.v2.glt_trainer", - args={}, - use_cuda=False, - container_uri="gcr.io/p/img", - vertex_ai_resource_config=resource_config, - env_vars=[], + compute_env.get("GIGL_TENSORBOARD_EXPERIMENT_NAME"), experiment_name ) - self.assertIsNone(cfg.tensorboard_experiment_name) + self.assertNotIn("GIGL_TENSORBOARD_EXPERIMENT_NAME", storage_env_names) def test_build_job_config_injects_gigl_tensorboard_env_vars(self) -> None: """When tensorboard_experiment_name is set with a TB resource, the From 43779eaddc7590d86b1d7c073c3f10b5cccabdab Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 6 May 2026 16:24:53 +0000 Subject: [PATCH 57/59] tensorboard_writer: log named-experiment URL when uploader starts Compensates for losing the Vertex AI job-page "Open TensorBoard" button: the chief-rank uploader now logs the cross-job ``TensorboardExperiment`` URL on start so engineers can find the comparison page from trainer stdout. 
URL format inlined from the helper that previously lived in
``vertex_ai.py:_build_tensorboard_experiment_url``; three lines of
duplication beats a shared module for this.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 gigl/utils/tensorboard_writer.py            | 18 +++++++++++
 tests/unit/utils/tensorboard_writer_test.py | 35 +++++++++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/gigl/utils/tensorboard_writer.py b/gigl/utils/tensorboard_writer.py
index 5cc9350ad..ceb3e9719 100644
--- a/gigl/utils/tensorboard_writer.py
+++ b/gigl/utils/tensorboard_writer.py
@@ -7,6 +7,10 @@
 import tensorflow as tf
 from google.cloud import aiplatform

+from gigl.common.logger import Logger
+
+logger = Logger()
+
 # Vertex AI sets this env var to ``<base_output_dir>/logs/`` (or
 # ``<base_output_dir>/<trial_id>/logs/`` for HyperparameterTuningJob trials)
 # when ``CustomJobSpec.baseOutputDirectory`` is configured. GiGL's launcher
@@ -225,4 +229,18 @@ def _maybe_start_uploader(*, parent_log_dir: str) -> bool:
         tensorboard_experiment_name=experiment_name,
         logdir=parent_log_dir,
     )
+    # Log the TB UI URL so engineers can find the named experiment without
+    # the Vertex AI job page's "Open TensorBoard" button (which is no longer
+    # rendered now that GiGL doesn't pass ``submit(tensorboard=...)``).
+ experiment_url = ( + f"https://{match['location']}.tensorboard.googleusercontent.com/experiment/" + f"projects+{match['project']}" + f"+locations+{match['location']}" + f"+tensorboards+{match['tensorboard_id']}" + f"+experiments+{experiment_name}" + ) + logger.info( + f"View TensorBoard (cross-job comparison, experiment={experiment_name!r}): " + f"{experiment_url}" + ) return True diff --git a/tests/unit/utils/tensorboard_writer_test.py b/tests/unit/utils/tensorboard_writer_test.py index a71173549..01e120440 100644 --- a/tests/unit/utils/tensorboard_writer_test.py +++ b/tests/unit/utils/tensorboard_writer_test.py @@ -5,6 +5,7 @@ from absl.testing import absltest +from gigl.utils import tensorboard_writer as tensorboard_writer_module from gigl.utils.tensorboard_writer import TensorBoardWriter from tests.test_assets.test_case import TestCase @@ -221,6 +222,40 @@ def test_uploader_skipped_for_disabled_writer(self) -> None: mock_start.assert_not_called() + def test_uploader_logs_named_experiment_url_on_start(self) -> None: + """The named-experiment URL is logged so engineers can find the TB + page without the (now-absent) Vertex AI job-page button. 
+ """ + with patch.dict( + os.environ, + { + "AIP_TENSORBOARD_LOG_DIR": self._LOG_DIR, + "GIGL_TENSORBOARD_RESOURCE_NAME": self._TB_RESOURCE, + "GIGL_TENSORBOARD_EXPERIMENT_NAME": self._EXPERIMENT, + }, + clear=True, + ): + with patch("gigl.utils.tensorboard_writer.tf.summary.create_file_writer"): + with ( + patch("google.cloud.aiplatform.start_upload_tb_log"), + patch("google.cloud.aiplatform.init"), + patch("google.cloud.aiplatform.end_upload_tb_log"), + patch.object( + tensorboard_writer_module.logger, "info" + ) as mock_info, + ): + writer = TensorBoardWriter.from_env() + writer.close() + + info_calls = [call.args[0] for call in mock_info.call_args_list] + url_log = next( + (msg for msg in info_calls if "View TensorBoard" in msg), None + ) + self.assertIsNotNone(url_log) + self.assertIn(self._EXPERIMENT, url_log) + self.assertIn("tensorboards+42", url_log) + self.assertIn("us-central1", url_log) + def test_uploader_failure_after_writer_construction_closes_writer(self) -> None: """If start_upload_tb_log raises, the TF file writer is closed and the exception propagates — no leaked uploader thread, no half-built From 379021006ee69359b411c0a75e63310102688d3f Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 6 May 2026 16:30:49 +0000 Subject: [PATCH 58/59] chore: ruff format Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/unit/src/common/vertex_ai_launcher_test.py | 3 +-- tests/unit/utils/tensorboard_writer_test.py | 8 ++------ 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/tests/unit/src/common/vertex_ai_launcher_test.py b/tests/unit/src/common/vertex_ai_launcher_test.py index 084fbb281..b566782de 100644 --- a/tests/unit/src/common/vertex_ai_launcher_test.py +++ b/tests/unit/src/common/vertex_ai_launcher_test.py @@ -411,8 +411,7 @@ def test_launch_graph_store_job_reads_experiment_name_from_compute_pool( storage_job_config = call_args.kwargs["storage_pool_job_config"] compute_env = { - ev.name: ev.value - for ev in 
compute_job_config.environment_variables or [] + ev.name: ev.value for ev in compute_job_config.environment_variables or [] } storage_env_names = { ev.name for ev in storage_job_config.environment_variables or [] diff --git a/tests/unit/utils/tensorboard_writer_test.py b/tests/unit/utils/tensorboard_writer_test.py index 01e120440..a95140692 100644 --- a/tests/unit/utils/tensorboard_writer_test.py +++ b/tests/unit/utils/tensorboard_writer_test.py @@ -240,17 +240,13 @@ def test_uploader_logs_named_experiment_url_on_start(self) -> None: patch("google.cloud.aiplatform.start_upload_tb_log"), patch("google.cloud.aiplatform.init"), patch("google.cloud.aiplatform.end_upload_tb_log"), - patch.object( - tensorboard_writer_module.logger, "info" - ) as mock_info, + patch.object(tensorboard_writer_module.logger, "info") as mock_info, ): writer = TensorBoardWriter.from_env() writer.close() info_calls = [call.args[0] for call in mock_info.call_args_list] - url_log = next( - (msg for msg in info_calls if "View TensorBoard" in msg), None - ) + url_log = next((msg for msg in info_calls if "View TensorBoard" in msg), None) self.assertIsNotNone(url_log) self.assertIn(self._EXPERIMENT, url_log) self.assertIn("tensorboards+42", url_log) From ff250c1df48c48a7235ece12fdc086a81d4503cb Mon Sep 17 00:00:00 2001 From: kmontemayor Date: Wed, 6 May 2026 16:48:56 +0000 Subject: [PATCH 59/59] launcher: log named-experiment TB URL at submit time The chief-rank uploader inside the trainer container also logs this URL (commit 43779ead), but that surfaces only in Vertex AI job logs, which take a minute or two to materialize. Logging from the launcher puts the URL in local stdout immediately at submit time. 
Co-Authored-By: Claude Opus 4.7 (1M context)
---
 gigl/src/common/vertex_ai_launcher.py | 41 +++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/gigl/src/common/vertex_ai_launcher.py b/gigl/src/common/vertex_ai_launcher.py
index bc9734b55..032ab9ff0 100644
--- a/gigl/src/common/vertex_ai_launcher.py
+++ b/gigl/src/common/vertex_ai_launcher.py
@@ -51,6 +51,44 @@
     r"[^a-zA-Z0-9\n-]"
 )

+# Captures the project/location/tensorboard_id pieces of a fully-qualified
+# Vertex AI TensorBoard resource name. Used to build the TensorBoard UI URL.
+_TENSORBOARD_RESOURCE_NAME_PATTERN: Final[re.Pattern[str]] = re.compile(
+    r"^projects/(?P<project>[^/]+)"
+    r"/locations/(?P<location>[^/]+)"
+    r"/tensorboards/(?P<tensorboard_id>[^/]+)$"
+)
+
+
+def _maybe_log_tensorboard_url(
+    vertex_ai_resource_config: VertexAiResourceConfig,
+) -> None:
+    """Log the cross-job TensorBoard UI URL when the experiment is configured.
+
+    The chief-rank uploader inside the trainer container also logs this URL,
+    but that only surfaces in Vertex AI job logs (which take a minute to
+    materialize). Logging it here means the URL appears in the launcher's
+    local stdout immediately at submit time.
+    """
+    tb_resource = vertex_ai_resource_config.tensorboard_resource_name
+    experiment_name = vertex_ai_resource_config.tensorboard_experiment_name
+    if not tb_resource or not experiment_name:
+        return
+    match = _TENSORBOARD_RESOURCE_NAME_PATTERN.match(tb_resource)
+    if not match:
+        return
+    url = (
+        f"https://{match['location']}.tensorboard.googleusercontent.com/experiment/"
+        f"projects+{match['project']}"
+        f"+locations+{match['location']}"
+        f"+tensorboards+{match['tensorboard_id']}"
+        f"+experiments+{experiment_name}"
+    )
+    logger.info(
+        f"View TensorBoard (cross-job comparison, experiment={experiment_name!r}): "
+        f"{url}"
+    )
+

 def _sanitize_for_vertex_run(value: str) -> str:
     """Coerce ``value`` into the SDK's TensorboardRun-name character class.
@@ -138,6 +176,7 @@ def launch_single_pool_job( tensorboard_logs_uri=tensorboard_logs_uri, ) logger.info(f"Launching {component.value} job with config: {job_config}") + _maybe_log_tensorboard_url(vertex_ai_resource_config) vertex_ai_service = VertexAIService( project=resource_config_wrapper.project, @@ -259,6 +298,8 @@ def launch_graph_store_enabled_job( else resource_config_wrapper.region ) + _maybe_log_tensorboard_url(compute_pool_config) + vertex_ai_service = VertexAIService( project=resource_config_wrapper.project, location=region,