From 321569fbe57d1d574b96119c2494ed3258dfaf1a Mon Sep 17 00:00:00 2001
From: Bob <uncoolbob@gmail.com>
Date: Tue, 3 Mar 2020 14:44:43 +0000
Subject: [PATCH 1/8] making a start on popbio branch

---
 Model/data/documentTypeCategories.json | 5 +++++
 Model/data/nonWdkDocumentFields.json   | 8 ++++++++
 2 files changed, 13 insertions(+)

diff --git a/Model/data/documentTypeCategories.json b/Model/data/documentTypeCategories.json
index 5b52223..ff9bcf0 100644
--- a/Model/data/documentTypeCategories.json
+++ b/Model/data/documentTypeCategories.json
@@ -43,6 +43,11 @@
         "displayNamePlural": "Popset isolate sequences",
         "wdkSearchUrlName": "PopsetIsolatesByText",
         "hasOrganismField": false
+      },
+      { "id": "popbioSample",
+	"displayName": "Field sample",
+	"displayNamePlural": "Field samples",
+	"hasOrganismField": false
       }
     ]
   },
diff --git a/Model/data/nonWdkDocumentFields.json b/Model/data/nonWdkDocumentFields.json
index 8904a73..a093564 100644
--- a/Model/data/nonWdkDocumentFields.json
+++ b/Model/data/nonWdkDocumentFields.json
@@ -21,5 +21,13 @@
      "boost": 1
     }
   ]
+ },
+ { "document-type": "popbioSample",
+   "fields": [
+     {"name":"TEXT__popbio_species",
+      "isSummary": false,
+      "boost": 1
+     }
+   ]
  }
 ]

From c86e54cf4600bee6771592094dddc5d1274ff351 Mon Sep 17 00:00:00 2001
From: Bob <uncoolbob@gmail.com>
Date: Thu, 12 Mar 2020 14:09:11 +0000
Subject: [PATCH 2/8] first complete draft

---
 Model/bin/interimMapVEuCSVtoSolr       | 124 +++++++++++++++++++++++++
 Model/data/documentTypeCategories.json |   2 +-
 Model/data/nonWdkDocumentFields.json   |  43 ++++++++-
 3 files changed, 167 insertions(+), 2 deletions(-)
 create mode 100755 Model/bin/interimMapVEuCSVtoSolr

diff --git a/Model/bin/interimMapVEuCSVtoSolr b/Model/bin/interimMapVEuCSVtoSolr
new file mode 100755
index 0000000..4f0c8bc
--- /dev/null
+++ b/Model/bin/interimMapVEuCSVtoSolr
@@ -0,0 +1,124 @@
+#!/usr/bin/env perl
+#  -*- mode: cperl -*-
+
+#
+# usage:
+#
+# 1. go to popbio map in sample view and export ALL samples as CSV
+# 2. ./interimMapVEuCSVtoSolr export.csv
+# 3. that will generate a directory called ./solr-json-batch_BATCHID
+#    which contains all JSON needed for indexing
+#
+#
+
+use strict;
+use warnings;
+use Text::CSV_XS;
+use JSON;
+use utf8::all;
+
+my ($file) = @ARGV;
+
+die "Must provide MapVEu CSV file as argument\n" unless ($file && -s $file);
+
+my $batch_name = "popbio";
+my $batch_type = "samples";
+my $batch_timestamp = time();
+my $batch_id = sprintf "%s_%s_%d", $batch_type, $batch_name, $batch_timestamp;
+my $document_type = "popbio-sample";
+
+my $output_dir = "solr-json-batch_${batch_id}";
+mkdir $output_dir || die;
+
+my $json = JSON->new; # ->pretty;
+
+# output batch info JSON
+my $batch_info = [
+		  {
+		   "batch-type" => $batch_type,
+		   "batch-name" => $batch_name,
+		   "document-type" => "batch-meta",
+		   "batch-timestamp" => $batch_timestamp,
+		   "batch-id" => $batch_id,
+		   "id" => $batch_id,
+		  }
+		 ];
+
+if (open(my $batch_info_fh, ">$output_dir/batch.json")) {
+  print $batch_info_fh $json->encode($batch_info);
+  close($batch_info_fh);
+} else {
+  die "couldn't write $output_dir/batch.json\n";
+}
+
+
+my $csv = Text::CSV_XS->new ({ binary => 1, auto_diag => 1 });
+open my $fh, "<:encoding(utf8)", $file or die "$file: $!";
+
+my $headers = $csv->getline($fh);
+
+# header to index
+my %h2i;
+for (my $i=0; $i<@$headers; $i++) { $h2i{$headers->[$i]}=$i }
+
+
+my %SolrField2function =
+  (
+   # Solr_Field_name => function that takes a $row object
+
+   "id" => sub { $document_type."_".$_[0]->[$h2i{"Sample ID"}] },
+   "documentType" => sub { $document_type },
+
+   "batch-id" => sub { $batch_id },
+   "batch-type" => sub { $batch_type },
+   "batch-timestamp" => sub { $batch_timestamp },
+   "batch-name" => sub { $batch_name },
+
+   "TEXT__popbio_species" => sub { $_[0]->[$h2i{"Species"}] },
+   "TEXT__popbio_sample_id" => sub { $_[0]->[$h2i{"Sample ID"}] },
+   "TEXT__popbio_sample_name" => sub { $_[0]->[$h2i{"Label"}] },
+   "TEXT__popbio_collection_id" => sub { $_[0]->[$h2i{"Collection ID"}] },
+
+   ### the place name is not particularly useful because in the map CSV export only the GADM-assigned ADM2 name is
+   ### available and it's not usually a placename that anyone would search for (like the country, adm1 or perhaps
+   ### the actual location (e.g. village) collected in)
+   ### also, this CSV field is not actually multi-valued, despite the pluralised name.
+   "TEXT__popbio_collection_location" => sub { $_[0]->[$h2i{"Locations"}] },
+
+   "TEXT__popbio_computed_description" => sub { sprintf "Sample %s collected from %s on %s",
+						  $_[0]->[$h2i{"Label"}],
+						  $_[0]->[$h2i{"Locations"}],
+						  $_[0]->[$h2i{"Collection date range"}]
+					      },
+
+   "MULTITEXT__popbio_collection_protocols" => sub { [ split /,/, $_[0]->[$h2i{"Collection protocols"}] ] },
+
+   ### project IDs should end up being single valued after maybe more curation
+   ### but for now multi-valued is safer
+   "MULTITEXT__popbio_project_ids" => sub { [ split /,/, $_[0]->[$h2i{"Projects"}] ] }, 
+
+   "MULTITEXT__popbio_citations" => sub { [ split /,/, $_[0]->[$h2i{"Citations"}] ] },
+   "MULTITEXT__popbio_tags" => sub { [ split /,/, $_[0]->[$h2i{"Tag"}] ] },
+
+   ### not indexing because more suited to advanced/strategy search or in-map filtering:
+   ### sex, dev stage, attractants, sample type, available data types
+  );
+
+
+open(my $samples_fh, ">$output_dir/$batch_type.json") || die "can't open output json";
+
+my $count;
+print $samples_fh "[\n";
+while (my $row = $csv->getline($fh)) {
+  my $doc = { };
+  foreach my $SolrField (keys %SolrField2function) {
+    $doc->{$SolrField} = $SolrField2function{$SolrField}($row);
+  }
+  print $samples_fh ",\n" if ($count++);
+  print $samples_fh $json->encode($doc);
+  # printing each document at a time, to prevent unnecessary memory use
+}
+print $samples_fh "]\n";
+
+
+close($samples_fh);
diff --git a/Model/data/documentTypeCategories.json b/Model/data/documentTypeCategories.json
index ff9bcf0..62768bc 100644
--- a/Model/data/documentTypeCategories.json
+++ b/Model/data/documentTypeCategories.json
@@ -44,7 +44,7 @@
         "wdkSearchUrlName": "PopsetIsolatesByText",
         "hasOrganismField": false
       },
-      { "id": "popbioSample",
+      { "id": "popbio-sample",
 	"displayName": "Field sample",
 	"displayNamePlural": "Field samples",
 	"hasOrganismField": false
diff --git a/Model/data/nonWdkDocumentFields.json b/Model/data/nonWdkDocumentFields.json
index a093564..384e60e 100644
--- a/Model/data/nonWdkDocumentFields.json
+++ b/Model/data/nonWdkDocumentFields.json
@@ -22,10 +22,51 @@
     }
   ]
  },
- { "document-type": "popbioSample",
+ { "document-type": "popbio_sample",
    "fields": [
      {"name":"TEXT__popbio_species",
       "isSummary": false,
+      "displayName": "Species",
+      "boost": 1
+     },
+     {"name":"TEXT__popbio_sample_id",
+      "isSummary": false,
+      "displayName": "Sample ID",
+      "boost": 1
+     },
+     {"name":"TEXT__popbio_sample_name",
+      "isSummary": false,
+      "displayName": "Sample name",
+      "boost": 1
+     },
+     {"name":"TEXT__popbio_collection_id",
+      "isSummary": false,
+      "displayName": "Collection ID",
+      "boost": 1
+     },
+     {"name":"TEXT__popbio_collection_location",
+      "isSummary": false,
+      "displayName": "Collection location",
+      "boost": 1
+     },
+     {"name":"TEXT__popbio_computed_description",
+      "isSummary": true,
+      "displayName": "Description",
+      "boost": 1
+     },
+     {"name":"MULTITEXT__popbio_project_ids",
+      "isSummary": false,
+      "displayName": "Project ID",
+      "boost": 1
+     },
+     {"name":"MULTITEXT__popbio_citations",
+      "isSummary": false,
+      "displayName": "Citations",
+      "boost": 1
+     },
+     {"name":"MULTITEXT__popbio_tags",
+      "isSummary": false,
+      "displayName": "Tags",
       "boost": 1
      }
    ]

From 207a2b5fb165d38d5385ae33006ee23309226c76 Mon Sep 17 00:00:00 2001
From: Bob <uncoolbob@gmail.com>
Date: Wed, 18 Mar 2020 18:41:32 +0000
Subject: [PATCH 3/8] changes requested by Steve

---
 Model/bin/interimMapVEuCSVtoSolr     | 3 +++
 Model/data/nonWdkDocumentFields.json | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/Model/bin/interimMapVEuCSVtoSolr b/Model/bin/interimMapVEuCSVtoSolr
index 4f0c8bc..a0e1c17 100755
--- a/Model/bin/interimMapVEuCSVtoSolr
+++ b/Model/bin/interimMapVEuCSVtoSolr
@@ -68,6 +68,9 @@ my %SolrField2function =
 
    "id" => sub { $document_type."_".$_[0]->[$h2i{"Sample ID"}] },
    "documentType" => sub { $document_type },
+   "project" => sub { 'VectorBase' },
+   "primaryKey" => sub { $_[0]->[$h2i{"Sample ID"}] },
+   "hyperlinkName" => sub { sprintf "See %s in MapVEu", $_[0]->[$h2i{"Sample ID"}] },
 
    "batch-id" => sub { $batch_id },
    "batch-type" => sub { $batch_type },
diff --git a/Model/data/nonWdkDocumentFields.json b/Model/data/nonWdkDocumentFields.json
index 384e60e..289f520 100644
--- a/Model/data/nonWdkDocumentFields.json
+++ b/Model/data/nonWdkDocumentFields.json
@@ -25,12 +25,12 @@
  { "document-type": "popbio_sample",
    "fields": [
      {"name":"TEXT__popbio_species",
-      "isSummary": false,
+      "isSummary": true,
       "displayName": "Species",
       "boost": 1
      },
      {"name":"TEXT__popbio_sample_id",
-      "isSummary": false,
+      "isSummary": true,
       "displayName": "Sample ID",
       "boost": 1
      },

From c99189b7ef940d6399d7fa91b4994ae70a42a728 Mon Sep 17 00:00:00 2001
From: Bob <uncoolbob@gmail.com>
Date: Thu, 19 Mar 2020 20:17:44 +0000
Subject: [PATCH 4/8] documentType to document-type field name

---
 Model/bin/interimMapVEuCSVtoSolr | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Model/bin/interimMapVEuCSVtoSolr b/Model/bin/interimMapVEuCSVtoSolr
index a0e1c17..b18ecc1 100755
--- a/Model/bin/interimMapVEuCSVtoSolr
+++ b/Model/bin/interimMapVEuCSVtoSolr
@@ -67,7 +67,7 @@ my %SolrField2function =
    # Solr_Field_name => function that takes a $row object
 
    "id" => sub { $document_type."_".$_[0]->[$h2i{"Sample ID"}] },
-   "documentType" => sub { $document_type },
+   "document-type" => sub { $document_type },
    "project" => sub { 'VectorBase' },
    "primaryKey" => sub { $_[0]->[$h2i{"Sample ID"}] },
    "hyperlinkName" => sub { sprintf "See %s in MapVEu", $_[0]->[$h2i{"Sample ID"}] },

From 02048e52cf349c861f010fc366f750261c4758f4 Mon Sep 17 00:00:00 2001
From: Bob <uncoolbob@gmail.com>
Date: Fri, 20 Mar 2020 10:54:01 +0000
Subject: [PATCH 5/8] underscore to dash

---
 Model/data/nonWdkDocumentFields.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Model/data/nonWdkDocumentFields.json b/Model/data/nonWdkDocumentFields.json
index 289f520..8d89bea 100644
--- a/Model/data/nonWdkDocumentFields.json
+++ b/Model/data/nonWdkDocumentFields.json
@@ -22,7 +22,7 @@
     }
   ]
  },
- { "document-type": "popbio_sample",
+ { "document-type": "popbio-sample",
    "fields": [
      {"name":"TEXT__popbio_species",
       "isSummary": true,

From 7176125d458ca257ef2d9d6ea887c8e072d79562 Mon Sep 17 00:00:00 2001
From: Bob <uncoolbob@gmail.com>
Date: Fri, 20 Mar 2020 14:25:04 +0000
Subject: [PATCH 6/8] no more hyperLinkName

---
 Model/bin/interimMapVEuCSVtoSolr | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Model/bin/interimMapVEuCSVtoSolr b/Model/bin/interimMapVEuCSVtoSolr
index b18ecc1..be0024c 100755
--- a/Model/bin/interimMapVEuCSVtoSolr
+++ b/Model/bin/interimMapVEuCSVtoSolr
@@ -70,7 +70,7 @@ my %SolrField2function =
    "document-type" => sub { $document_type },
    "project" => sub { 'VectorBase' },
    "primaryKey" => sub { $_[0]->[$h2i{"Sample ID"}] },
-   "hyperlinkName" => sub { sprintf "See %s in MapVEu", $_[0]->[$h2i{"Sample ID"}] },
+#   "hyperlinkName" => sub { sprintf "See %s in MapVEu", $_[0]->[$h2i{"Sample ID"}] },
 
    "batch-id" => sub { $batch_id },
    "batch-type" => sub { $batch_type },

From 312b25e655e26999228348019c675234437eab8c Mon Sep 17 00:00:00 2001
From: Bob <uncoolbob@gmail.com>
Date: Tue, 18 Jun 2024 22:59:43 +0100
Subject: [PATCH 7/8] stop-gap script for b68 hot fix

---
 Model/bin/interimMapVEuCSVtoSolr | 212 +++++++++++++++++++------------
 1 file changed, 133 insertions(+), 79 deletions(-)

diff --git a/Model/bin/interimMapVEuCSVtoSolr b/Model/bin/interimMapVEuCSVtoSolr
index be0024c..52b3669 100755
--- a/Model/bin/interimMapVEuCSVtoSolr
+++ b/Model/bin/interimMapVEuCSVtoSolr
@@ -2,12 +2,26 @@
 #  -*- mode: cperl -*-
 
 #
-# usage:
+# interim interim script for b68
+#
+# download the four files named in the command below from Megastudy MapVEu, then run:
+#
+# ./interimMapVEuCSVtoSolr studies.txt collection_sites.txt collections.txt samples.txt
+#
+# make sure these columns are included:
+#
+#==> studies.txt <==
+#Study_ID	PubMed ID [OBI_0001617]	DOI [OBI_0002110]	PopBio Study ID [POPBIO_8000215]	Tags [POPBIO_8000214]	Institution [POPBIO_8000185]
+#
+#==> collection_sites.txt <==
+#Collection_site_ID	Study_ID	provider name for collection site [EUPATH_0000542]	town [POPBIO_8000015]	Administrative region, level 2 [ENVO_00000006]	Administrative region, level 1 [ENVO_00000005]	country [OBI_0001627]	continent [GAZ_00000013]
+#
+#==> collections.txt <==
+#Collection_ID	Collection_site_ID	Study_ID	protocol [OBI_0000272]	specimen collection date(s) (raw) [OBI_0001619]
+#
+#==> samples.txt <==
+#Sample_ID	Collection_ID	Collection_site_ID	Study_ID	species [OBI_0001909]
 #
-# 1. go to popbio map in sample view and export ALL samples as CSV
-# 2. ./interimMapVEuCSVtoSolr export.csv
-# 3. that will generate a directory called ./solr-json-batch_BATCHID
-#    which contains all JSON needed for indexing
 #
 #
 
@@ -17,9 +31,9 @@ use Text::CSV_XS;
 use JSON;
 use utf8::all;
 
-my ($file) = @ARGV;
+my ($studies_file, $collection_sites_file, $collections_file, $samples_file) = @ARGV;
 
-die "Must provide MapVEu CSV file as argument\n" unless ($file && -s $file);
+die "Must provide MapVEu files as arguments\n" unless ($studies_file && -s $studies_file && $collection_sites_file && -s $collection_sites_file && $collections_file && -s $collections_file && $samples_file && -s $samples_file);
 
 my $batch_name = "popbio";
 my $batch_type = "samples";
@@ -30,98 +44,138 @@ my $document_type = "popbio-sample";
 my $output_dir = "solr-json-batch_${batch_id}";
 mkdir $output_dir || die;
 
-my $json = JSON->new; # ->pretty;
+my $json = JSON->new;
 
 # output batch info JSON
 my $batch_info = [
-		  {
-		   "batch-type" => $batch_type,
-		   "batch-name" => $batch_name,
-		   "document-type" => "batch-meta",
-		   "batch-timestamp" => $batch_timestamp,
-		   "batch-id" => $batch_id,
-		   "id" => $batch_id,
-		  }
-		 ];
+    {
+        "batch-type" => $batch_type,
+        "batch-name" => $batch_name,
+        "document-type" => "batch-meta",
+        "batch-timestamp" => $batch_timestamp,
+        "batch-id" => $batch_id,
+        "id" => $batch_id,
+    }
+];
 
 if (open(my $batch_info_fh, ">$output_dir/batch.json")) {
-  print $batch_info_fh $json->encode($batch_info);
-  close($batch_info_fh);
+    print $batch_info_fh $json->encode($batch_info);
+    close($batch_info_fh);
 } else {
-  die "couldn't write $output_dir/batch.json\n";
+    die "couldn't write $output_dir/batch.json\n";
 }
 
+my $csv = Text::CSV_XS->new({ sep_char => "\t", allow_loose_quotes => 1, binary => 1, auto_diag => 1 });
 
-my $csv = Text::CSV_XS->new ({ binary => 1, auto_diag => 1 });
-open my $fh, "<:encoding(utf8)", $file or die "$file: $!";
+# Load studies.txt into a hash
+open my $studies_fh, "<:encoding(utf8)", $studies_file or die "$studies_file: $!";
+my $studies_headers = $csv->getline($studies_fh);
+my %studies;
+while (my $row = $csv->getline($studies_fh)) {
+    my %data;
+    @data{@$studies_headers} = @$row;
+    $studies{$data{"Study_ID"}} = \%data;
+}
+close($studies_fh);
+
+# Load collection_sites.txt into a hash
+open my $collection_sites_fh, "<:encoding(utf8)", $collection_sites_file or die "$collection_sites_file: $!";
+my $collection_sites_headers = $csv->getline($collection_sites_fh);
+my %collection_sites;
+while (my $row = $csv->getline($collection_sites_fh)) {
+    my %data;
+    @data{@$collection_sites_headers} = @$row;
+    $collection_sites{$data{"Collection_site_ID"}} = \%data;
+}
+close($collection_sites_fh);
+
+# Load collections.txt into a hash
+open my $collections_fh, "<:encoding(utf8)", $collections_file or die "$collections_file: $!";
+my $collections_headers = $csv->getline($collections_fh);
+my %collections;
+while (my $row = $csv->getline($collections_fh)) {
+    my %data;
+    @data{@$collections_headers} = @$row;
+    $collections{$data{"Collection_ID"}} = \%data;
+}
+close($collections_fh);
 
+# Open samples.txt and process it
+open my $fh, "<:encoding(utf8)", $samples_file or die "$samples_file: $!";
 my $headers = $csv->getline($fh);
 
 # header to index
 my %h2i;
-for (my $i=0; $i<@$headers; $i++) { $h2i{$headers->[$i]}=$i }
-
-
-my %SolrField2function =
-  (
-   # Solr_Field_name => function that takes a $row object
-
-   "id" => sub { $document_type."_".$_[0]->[$h2i{"Sample ID"}] },
-   "document-type" => sub { $document_type },
-   "project" => sub { 'VectorBase' },
-   "primaryKey" => sub { $_[0]->[$h2i{"Sample ID"}] },
-#   "hyperlinkName" => sub { sprintf "See %s in MapVEu", $_[0]->[$h2i{"Sample ID"}] },
-
-   "batch-id" => sub { $batch_id },
-   "batch-type" => sub { $batch_type },
-   "batch-timestamp" => sub { $batch_timestamp },
-   "batch-name" => sub { $batch_name },
-
-   "TEXT__popbio_species" => sub { $_[0]->[$h2i{"Species"}] },
-   "TEXT__popbio_sample_id" => sub { $_[0]->[$h2i{"Sample ID"}] },
-   "TEXT__popbio_sample_name" => sub { $_[0]->[$h2i{"Label"}] },
-   "TEXT__popbio_collection_id" => sub { $_[0]->[$h2i{"Collection ID"}] },
-
-   ### the place name is not particularly useful because in the map CSV export only the GADM-assigned ADM2 name is
-   ### available and it's not usually a placename that anyone would search for (like the country, adm1 or perhaps
-   ### the actual location (e.g. village) collected in)
-   ### also, this CSV field is not actually multi-valued, despite the pluralised name.
-   "TEXT__popbio_collection_location" => sub { $_[0]->[$h2i{"Locations"}] },
-
-   "TEXT__popbio_computed_description" => sub { sprintf "Sample %s collected from %s on %s",
-						  $_[0]->[$h2i{"Label"}],
-						  $_[0]->[$h2i{"Locations"}],
-						  $_[0]->[$h2i{"Collection date range"}]
-					      },
-
-   "MULTITEXT__popbio_collection_protocols" => sub { [ split /,/, $_[0]->[$h2i{"Collection protocols"}] ] },
-
-   ### project IDs should end up being single valued after maybe more curation
-   ### but for now multi-valued is safer
-   "MULTITEXT__popbio_project_ids" => sub { [ split /,/, $_[0]->[$h2i{"Projects"}] ] }, 
-
-   "MULTITEXT__popbio_citations" => sub { [ split /,/, $_[0]->[$h2i{"Citations"}] ] },
-   "MULTITEXT__popbio_tags" => sub { [ split /,/, $_[0]->[$h2i{"Tag"}] ] },
-
-   ### not indexing because more suited to advanced/strategy search or in-map filtering:
-   ### sex, dev stage, attractants, sample type, available data types
-  );
-
+for (my $i = 0; $i < @$headers; $i++) {
+    $h2i{$headers->[$i]} = $i;
+}
 
-open(my $samples_fh, ">$output_dir/$batch_type.json") || die "can't open output json";
+my %SolrField2function = (
+    "id" => sub { $document_type . "_" . $_[0]->[$h2i{"Sample_ID"}] },
+    "document-type" => sub { $document_type },
+    "project" => sub { 'VectorBase' },
+    "primaryKey" => sub { $_[0]->[$h2i{"Sample_ID"}] },
+
+    "batch-id" => sub { $batch_id },
+    "batch-type" => sub { $batch_type },
+    "batch-timestamp" => sub { $batch_timestamp },
+    "batch-name" => sub { $batch_name },
+
+    "TEXT__popbio_species" => sub { $_[0]->[$h2i{"species [OBI_0001909]"}] || '' },
+    "TEXT__popbio_sample_id" => sub { $_[0]->[$h2i{"Sample_ID"}] || '' },
+    "TEXT__popbio_sample_name" => sub { $_[0]->[$h2i{"Sample_ID"}] || '' },
+    "TEXT__popbio_collection_id" => sub { $_[0]->[$h2i{"Collection_ID"}] || '' },
+    
+    # first non-empty place name for the sample's collection site, trying the columns in the order listed; '' if the site is unknown or all are empty
+    "TEXT__popbio_collection_location" => sub {
+        my $collection_site = $collection_sites{$_[0]->[$h2i{"Collection_site_ID"}]};
+        return $collection_site ? (grep { $_ } @$collection_site{("provider name for collection site [EUPATH_0000542]", "town [POPBIO_8000015]", "Administrative region, level 2 [ENVO_00000006]", "Administrative region, level 1 [ENVO_00000005]", "country [OBI_0001627]", "continent [GAZ_00000013]")})[0] || '' : '';
+    },
+    "TEXT__popbio_computed_description" => sub {
+        my $collection = $collections{$_[0]->[$h2i{"Collection_ID"}]};
+        my $collection_site = $collection_sites{$_[0]->[$h2i{"Collection_site_ID"}]};
+        return sprintf "Sample %s collected from %s on %s",
+            $_[0]->[$h2i{"Sample_ID"}] || '',
+            $collection_site ? (grep { $_ } @$collection_site{("provider name for collection site [EUPATH_0000542]", "town [POPBIO_8000015]", "Administrative region, level 2 [ENVO_00000006]", "Administrative region, level 1 [ENVO_00000005]", "country [OBI_0001627]", "continent [GAZ_00000013]")})[0] || '' : '',
+            $collection ? ($collection->{"specimen collection date(s) (raw) [OBI_0001619]"} || '') : '';
+      },
+    
+    # collections.txt is read through :encoding(utf8), so the protocol JSON is characters, not bytes: parse with from_json/utf8 => 0 (decode_json expects UTF-8 bytes and fails on non-ASCII text)
+    "MULTITEXT__popbio_collection_protocols" => sub {
+        my $collection = $collections{$_[0]->[$h2i{"Collection_ID"}]};
+        return $collection && $collection->{"protocol [OBI_0000272]"} ? from_json($collection->{"protocol [OBI_0000272]"}, { utf8 => 0 }) : [];
+    },
+    "MULTITEXT__popbio_project_ids" => sub {
+        my $study = $studies{$_[0]->[$h2i{"Study_ID"}]};
+        return $study && $study->{"PopBio Study ID [POPBIO_8000215]"} ? [$study->{"PopBio Study ID [POPBIO_8000215]"}] : [];
+    },
+    
+    "MULTITEXT__popbio_citations" => sub {
+        my $study = $studies{$_[0]->[$h2i{"Study_ID"}]};
+        return $study ? [grep { $_ } @{$study}{"PubMed ID [OBI_0001617]", "DOI [OBI_0002110]"}] : [];
+    },
+    
+    "MULTITEXT__popbio_tags" => sub {
+        my $study = $studies{$_[0]->[$h2i{"Study_ID"}]};
+        return $study ? [grep { $_ } ($study->{"Tags [POPBIO_8000214]"},
+				      $study->{"Institution [POPBIO_8000185]"} ? @{from_json($study->{"Institution [POPBIO_8000185]"}, { utf8 => 0 })} : undef
+				     )] : [];
+    }
+);
+
+open(my $samples_fh, ">:encoding(utf8)", "$output_dir/$batch_type.json") || die "can't open output json";
 
 my $count;
 print $samples_fh "[\n";
 while (my $row = $csv->getline($fh)) {
-  my $doc = { };
-  foreach my $SolrField (keys %SolrField2function) {
-    $doc->{$SolrField} = $SolrField2function{$SolrField}($row);
-  }
-  print $samples_fh ",\n" if ($count++);
-  print $samples_fh $json->encode($doc);
-  # printing each document at a time, to prevent unnecessary memory use
+    my $doc = {};
+    foreach my $SolrField (keys %SolrField2function) {
+        $doc->{$SolrField} = $SolrField2function{$SolrField}($row);
+    }
+    print $samples_fh ",\n" if ($count++);
+    print $samples_fh $json->encode($doc);
 }
 print $samples_fh "]\n";
 
-
 close($samples_fh);
+close($fh);

From 60b176f0de2b3bba59d36df2bc4c2b5871a1fa4c Mon Sep 17 00:00:00 2001
From: Bob <uncoolbob@gmail.com>
Date: Thu, 4 Jul 2024 12:57:43 +0100
Subject: [PATCH 8/8] chunked/batched output JSON because it's too big now

---
 Model/bin/interimMapVEuCSVtoSolr | 35 ++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/Model/bin/interimMapVEuCSVtoSolr b/Model/bin/interimMapVEuCSVtoSolr
index 52b3669..f65c259 100755
--- a/Model/bin/interimMapVEuCSVtoSolr
+++ b/Model/bin/interimMapVEuCSVtoSolr
@@ -41,6 +41,8 @@ my $batch_timestamp = time();
 my $batch_id = sprintf "%s_%s_%d", $batch_type, $batch_name, $batch_timestamp;
 my $document_type = "popbio-sample";
 
+my $chunk_size = 500000;
+
 my $output_dir = "solr-json-batch_${batch_id}";
 mkdir $output_dir || die;
 
@@ -101,8 +103,8 @@ while (my $row = $csv->getline($collections_fh)) {
 close($collections_fh);
 
 # Open samples.txt and process it
-open my $fh, "<:encoding(utf8)", $samples_file or die "$samples_file: $!";
-my $headers = $csv->getline($fh);
+open my $samples_fh, "<:encoding(utf8)", $samples_file or die "$samples_file: $!";
+my $headers = $csv->getline($samples_fh);
 
 # header to index
 my %h2i;
@@ -163,19 +165,34 @@ my %SolrField2function = (
     }
 );
 
-open(my $samples_fh, ">:encoding(utf8)", "$output_dir/$batch_type.json") || die "can't open output json";
+my $chunk_number = 1;
+my $json_output_fh;
 
 my $count;
-print $samples_fh "[\n";
-while (my $row = $csv->getline($fh)) {
+while (my $row = $csv->getline($samples_fh)) {
     my $doc = {};
     foreach my $SolrField (keys %SolrField2function) {
         $doc->{$SolrField} = $SolrField2function{$SolrField}($row);
     }
-    print $samples_fh ",\n" if ($count++);
-    print $samples_fh $json->encode($doc);
+
+    # if not yet opened or if filehandle closed at the end of a chunk, open a new file
+    if (!defined $json_output_fh || !defined(fileno($json_output_fh))) {
+      open($json_output_fh, ">:encoding(utf8)", "$output_dir/$batch_type.$chunk_number.json") || die "can't open output json";
+      print $json_output_fh "[\n";
+      $count = 0;
+    }
+
+    print $json_output_fh ",\n" if ($count++);
+    print $json_output_fh $json->encode($doc);
+
+    if ($count == $chunk_size) {
+      print $json_output_fh "\n]\n";
+      close($json_output_fh);
+      $chunk_number++;
+    }
 }
-print $samples_fh "]\n";
 
+# terminate the last chunk only if one is still open: none was ever opened for zero rows, and it is already closed after an exact multiple of $chunk_size rows
+if (defined $json_output_fh && defined fileno($json_output_fh)) { print $json_output_fh "\n]\n"; close($json_output_fh); }
 close($samples_fh);
-close($fh);
+