From 321569fbe57d1d574b96119c2494ed3258dfaf1a Mon Sep 17 00:00:00 2001 From: Bob Date: Tue, 3 Mar 2020 14:44:43 +0000 Subject: [PATCH 1/8] making a start on popbio branch --- Model/data/documentTypeCategories.json | 5 +++++ Model/data/nonWdkDocumentFields.json | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/Model/data/documentTypeCategories.json b/Model/data/documentTypeCategories.json index 5b52223..ff9bcf0 100644 --- a/Model/data/documentTypeCategories.json +++ b/Model/data/documentTypeCategories.json @@ -43,6 +43,11 @@ "displayNamePlural": "Popset isolate sequences", "wdkSearchUrlName": "PopsetIsolatesByText", "hasOrganismField": false + }, + { "id": "popbioSample", + "displayName": "Field sample", + "displayNamePlural": "Field samples", + "hasOrganismField": false } ] }, diff --git a/Model/data/nonWdkDocumentFields.json b/Model/data/nonWdkDocumentFields.json index 8904a73..a093564 100644 --- a/Model/data/nonWdkDocumentFields.json +++ b/Model/data/nonWdkDocumentFields.json @@ -21,5 +21,13 @@ "boost": 1 } ] + }, + { "document-type": "popbioSample", + "fields": [ + {"name":"TEXT__popbio_species", + "isSummary": false, + "boost": 1 + } + ] } ] From c86e54cf4600bee6771592094dddc5d1274ff351 Mon Sep 17 00:00:00 2001 From: Bob Date: Thu, 12 Mar 2020 14:09:11 +0000 Subject: [PATCH 2/8] first complete draft --- Model/bin/interimMapVEuCSVtoSolr | 124 +++++++++++++++++++++++++ Model/data/documentTypeCategories.json | 2 +- Model/data/nonWdkDocumentFields.json | 43 ++++++++- 3 files changed, 167 insertions(+), 2 deletions(-) create mode 100755 Model/bin/interimMapVEuCSVtoSolr diff --git a/Model/bin/interimMapVEuCSVtoSolr b/Model/bin/interimMapVEuCSVtoSolr new file mode 100755 index 0000000..4f0c8bc --- /dev/null +++ b/Model/bin/interimMapVEuCSVtoSolr @@ -0,0 +1,124 @@ +#!/usr/bin/env perl +# -*- mode: cperl -*- + +# +# usage: +# +# 1. go to popbio map in sample view and export ALL samples as CSV +# 2. ./interimMapVEuCSVtoSolr export.csv +# 3. that will generate a directory called ./solr-json-batch_BATCHID +# which contains all JSON needed for indexing +# +# + +use strict; +use warnings; +use Text::CSV_XS; +use JSON; +use utf8::all; + +my ($file) = @ARGV; + +die "Must provide MapVEu CSV file as argument\n" unless ($file && -s $file); + +my $batch_name = "popbio"; +my $batch_type = "samples"; +my $batch_timestamp = time(); +my $batch_id = sprintf "%s_%s_%d", $batch_type, $batch_name, $batch_timestamp; +my $document_type = "popbio-sample"; + +my $output_dir = "solr-json-batch_${batch_id}"; +mkdir $output_dir || die; + +my $json = JSON->new; # ->pretty; + +# output batch info JSON +my $batch_info = [ + { + "batch-type" => $batch_type, + "batch-name" => $batch_name, + "document-type" => "batch-meta", + "batch-timestamp" => $batch_timestamp, + "batch-id" => $batch_id, + "id" => $batch_id, + } + ]; + +if (open(my $batch_info_fh, ">$output_dir/batch.json")) { + print $batch_info_fh $json->encode($batch_info); + close($batch_info_fh); +} else { + die "couldn't write $output_dir/batch.json\n"; +} + + +my $csv = Text::CSV_XS->new ({ binary => 1, auto_diag => 1 }); +open my $fh, "<:encoding(utf8)", $file or die "$file: $!"; + +my $headers = $csv->getline($fh); + +# header to index +my %h2i; +for (my $i=0; $i<@$headers; $i++) { $h2i{$headers->[$i]}=$i } + + +my %SolrField2function = + ( + # Solr_Field_name => function that takes a $row object + + "id" => sub { $document_type."_".$_[0]->[$h2i{"Sample ID"}] }, + "documentType" => sub { $document_type }, + + "batch-id" => sub { $batch_id }, + "batch-type" => sub { $batch_type }, + "batch-timestamp" => sub { $batch_timestamp }, + "batch-name" => sub { $batch_name }, + + "TEXT__popbio_species" => sub { $_[0]->[$h2i{"Species"}] }, + "TEXT__popbio_sample_id" => sub { $_[0]->[$h2i{"Sample ID"}] }, + "TEXT__popbio_sample_name" => sub { $_[0]->[$h2i{"Label"}] }, + "TEXT__popbio_collection_id" => sub { $_[0]->[$h2i{"Collection ID"}] }, + + ### the place name is not particularly useful because in the map CSV export only the GADM-assigned ADM2 name is + ### available and it's not usually a placename that anyone would search for (like the country, adm1 or perhaps + ### the actual location (e.g. village) collected in) + ### also, this CSV field is not actually multi-valued, despite the pluralised name. + "TEXT__popbio_collection_location" => sub { $_[0]->[$h2i{"Locations"}] }, + + "TEXT__popbio_computed_description" => sub { sprintf "Sample %s collected from %s on %s", + $_[0]->[$h2i{"Label"}], + $_[0]->[$h2i{"Locations"}], + $_[0]->[$h2i{"Collection date range"}] + }, + + "MULTITEXT__popbio_collection_protocols" => sub { [ split /,/, $_[0]->[$h2i{"Collection protocols"}] ] }, + + ### project IDs should end up being single valued after maybe more curation + ### but for now multi-valued is safer + "MULTITEXT__popbio_project_ids" => sub { [ split /,/, $_[0]->[$h2i{"Projects"}] ] }, + + "MULTITEXT__popbio_citations" => sub { [ split /,/, $_[0]->[$h2i{"Citations"}] ] }, + "MULTITEXT__popbio_tags" => sub { [ split /,/, $_[0]->[$h2i{"Tag"}] ] }, + + ### not indexing because more suited to advanced/strategy search or in-map filtering: + ### sex, dev stage, attractants, sample type, available data types + ); + + +open(my $samples_fh, ">$output_dir/$batch_type.json") || die "can't open output json"; + +my $count; +print $samples_fh "[\n"; +while (my $row = $csv->getline($fh)) { + my $doc = { }; + foreach my $SolrField (keys %SolrField2function) { + $doc->{$SolrField} = $SolrField2function{$SolrField}($row); + } + print $samples_fh ",\n" if ($count++); + print $samples_fh $json->encode($doc); + # printing each document at a time, to prevent unnecessary memory use +} +print $samples_fh "]\n"; + + +close($samples_fh); diff --git a/Model/data/documentTypeCategories.json b/Model/data/documentTypeCategories.json index ff9bcf0..62768bc 100644 --- a/Model/data/documentTypeCategories.json +++ b/Model/data/documentTypeCategories.json @@ -44,7 +44,7 @@ "wdkSearchUrlName": "PopsetIsolatesByText", "hasOrganismField": false }, - { "id": "popbioSample", + { "id": "popbio-sample", "displayName": "Field sample", "displayNamePlural": "Field samples", "hasOrganismField": false diff --git a/Model/data/nonWdkDocumentFields.json b/Model/data/nonWdkDocumentFields.json index a093564..384e60e 100644 --- a/Model/data/nonWdkDocumentFields.json +++ b/Model/data/nonWdkDocumentFields.json @@ -22,10 +22,51 @@ } ] }, - { "document-type": "popbioSample", + { "document-type": "popbio_sample", "fields": [ {"name":"TEXT__popbio_species", "isSummary": false, + "displayName": "Species", + "boost": 1 + }, + {"name":"TEXT__popbio_sample_id", + "isSummary": false, + "displayName": "Sample ID", + "boost": 1 + }, + {"name":"TEXT__popbio_sample_name", + "isSummary": false, + "displayName": "Sample name", + "boost": 1 + }, + {"name":"TEXT__popbio_collection_id", + "isSummary": false, + "displayName": "Collection ID", + "boost": 1 + }, + {"name":"TEXT__popbio_collection_location", + "isSummary": false, + "displayName": "Collection location", + "boost": 1 + }, + {"name":"TEXT__popbio_computed_description", + "isSummary": true, + "displayName": "Description", + "boost": 1 + }, + {"name":"MULTITEXT__popbio_project_ids", + "isSummary": false, + "displayName": "Project ID", + "boost": 1 + }, + {"name":"MULTITEXT__popbio_citations", + "isSummary": false, + "displayName": "Citations", + "boost": 1 + }, + {"name":"MULTITEXT__popbio_tags", + "isSummary": false, + "displayName": "Tags", "boost": 1 } ] From 207a2b5fb165d38d5385ae33006ee23309226c76 Mon Sep 17 00:00:00 2001 From: Bob Date: Wed, 18 Mar 2020 18:41:32 +0000 Subject: [PATCH 3/8] changes requested by Steve --- Model/bin/interimMapVEuCSVtoSolr | 3 +++ Model/data/nonWdkDocumentFields.json | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Model/bin/interimMapVEuCSVtoSolr b/Model/bin/interimMapVEuCSVtoSolr index 4f0c8bc..a0e1c17 100755 --- a/Model/bin/interimMapVEuCSVtoSolr +++ b/Model/bin/interimMapVEuCSVtoSolr @@ -68,6 +68,9 @@ my %SolrField2function = "id" => sub { $document_type."_".$_[0]->[$h2i{"Sample ID"}] }, "documentType" => sub { $document_type }, + "project" => sub { 'VectorBase' }, + "primaryKey" => sub { $_[0]->[$h2i{"Sample ID"}] }, + "hyperlinkName" => sub { sprintf "See %s in MapVEu", $_[0]->[$h2i{"Sample ID"}] }, "batch-id" => sub { $batch_id }, "batch-type" => sub { $batch_type }, diff --git a/Model/data/nonWdkDocumentFields.json b/Model/data/nonWdkDocumentFields.json index 384e60e..289f520 100644 --- a/Model/data/nonWdkDocumentFields.json +++ b/Model/data/nonWdkDocumentFields.json @@ -25,12 +25,12 @@ { "document-type": "popbio_sample", "fields": [ {"name":"TEXT__popbio_species", - "isSummary": false, + "isSummary": true, "displayName": "Species", "boost": 1 }, {"name":"TEXT__popbio_sample_id", - "isSummary": false, + "isSummary": true, "displayName": "Sample ID", "boost": 1 }, From c99189b7ef940d6399d7fa91b4994ae70a42a728 Mon Sep 17 00:00:00 2001 From: Bob Date: Thu, 19 Mar 2020 20:17:44 +0000 Subject: [PATCH 4/8] documentType to document-type field name --- Model/bin/interimMapVEuCSVtoSolr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Model/bin/interimMapVEuCSVtoSolr b/Model/bin/interimMapVEuCSVtoSolr index a0e1c17..b18ecc1 100755 --- a/Model/bin/interimMapVEuCSVtoSolr +++ b/Model/bin/interimMapVEuCSVtoSolr @@ -67,7 +67,7 @@ my %SolrField2function = # Solr_Field_name => function that takes a $row object "id" => sub { $document_type."_".$_[0]->[$h2i{"Sample ID"}] }, - "documentType" => sub { $document_type }, + "document-type" => sub { $document_type }, "project" => sub { 'VectorBase' }, "primaryKey" => sub { $_[0]->[$h2i{"Sample ID"}] }, "hyperlinkName" => sub { sprintf "See %s in MapVEu", $_[0]->[$h2i{"Sample ID"}] }, From 02048e52cf349c861f010fc366f750261c4758f4 Mon Sep 17 00:00:00 2001 From: Bob Date: Fri, 20 Mar 2020 10:54:01 +0000 Subject: [PATCH 5/8] underscore to dash --- Model/data/nonWdkDocumentFields.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Model/data/nonWdkDocumentFields.json b/Model/data/nonWdkDocumentFields.json index 289f520..8d89bea 100644 --- a/Model/data/nonWdkDocumentFields.json +++ b/Model/data/nonWdkDocumentFields.json @@ -22,7 +22,7 @@ } ] }, - { "document-type": "popbio_sample", + { "document-type": "popbio-sample", "fields": [ {"name":"TEXT__popbio_species", "isSummary": true, From 7176125d458ca257ef2d9d6ea887c8e072d79562 Mon Sep 17 00:00:00 2001 From: Bob Date: Fri, 20 Mar 2020 14:25:04 +0000 Subject: [PATCH 6/8] no more hyperLinkName --- Model/bin/interimMapVEuCSVtoSolr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Model/bin/interimMapVEuCSVtoSolr b/Model/bin/interimMapVEuCSVtoSolr index b18ecc1..be0024c 100755 --- a/Model/bin/interimMapVEuCSVtoSolr +++ b/Model/bin/interimMapVEuCSVtoSolr @@ -70,7 +70,7 @@ my %SolrField2function = "document-type" => sub { $document_type }, "project" => sub { 'VectorBase' }, "primaryKey" => sub { $_[0]->[$h2i{"Sample ID"}] }, - "hyperlinkName" => sub { sprintf "See %s in MapVEu", $_[0]->[$h2i{"Sample ID"}] }, +# "hyperlinkName" => sub { sprintf "See %s in MapVEu", $_[0]->[$h2i{"Sample ID"}] }, "batch-id" => sub { $batch_id }, "batch-type" => sub { $batch_type }, From 312b25e655e26999228348019c675234437eab8c Mon Sep 17 00:00:00 2001 From: Bob Date: Tue, 18 Jun 2024 22:59:43 +0100 Subject: [PATCH 7/8] stop-gap script for b68 hot fix --- Model/bin/interimMapVEuCSVtoSolr | 212 +++++++++++++++++++------------ 1 file changed, 133 insertions(+), 79 deletions(-) diff --git a/Model/bin/interimMapVEuCSVtoSolr b/Model/bin/interimMapVEuCSVtoSolr index be0024c..52b3669 100755 --- a/Model/bin/interimMapVEuCSVtoSolr +++ b/Model/bin/interimMapVEuCSVtoSolr @@ -2,12 +2,26 @@ # -*- mode: cperl -*- # -# usage: +# interim interim script for b68 +# +# download from Megastudy MapVEu the following files +# +# ./interimMapVEuCSVtoSolr studies.txt collection_sites.txt collections.txt samples.txt +# +# make sure these columns are included: +# +#==> studies.txt <== +#Study_ID PubMed ID [OBI_0001617] DOI [OBI_0002110] PopBio Study ID [POPBIO_8000215] Tags [POPBIO_8000214] Institution [POPBIO_8000185] +# +#==> collection_sites.txt <== +#Collection_site_ID Study_ID provider name for collection site [EUPATH_0000542] town [POPBIO_8000015] Administrative region, level 2 [ENVO_00000006] Administrative region, level 1 [ENVO_00000005] country [OBI_0001627] continent [GAZ_00000013] +# +#==> collections.txt <== +#Collection_ID Collection_site_ID Study_ID protocol [OBI_0000272] specimen collection date(s) (raw) [OBI_0001619] +# +#==> samples.txt <== +#Sample_ID Collection_ID Collection_site_ID Study_ID species [OBI_0001909] # -# 1. go to popbio map in sample view and export ALL samples as CSV -# 2. ./interimMapVEuCSVtoSolr export.csv -# 3. that will generate a directory called ./solr-json-batch_BATCHID -# which contains all JSON needed for indexing # # @@ -17,9 +31,9 @@ use Text::CSV_XS; use JSON; use utf8::all; -my ($file) = @ARGV; +my ($studies_file, $collection_sites_file, $collections_file, $samples_file) = @ARGV; -die "Must provide MapVEu CSV file as argument\n" unless ($file && -s $file); +die "Must provide MapVEu files as arguments\n" unless ($studies_file && -s $studies_file && $collection_sites_file && -s $collection_sites_file && $collections_file && -s $collections_file && $samples_file && -s $samples_file); my $batch_name = "popbio"; my $batch_type = "samples"; @@ -30,98 +44,138 @@ my $document_type = "popbio-sample"; my $output_dir = "solr-json-batch_${batch_id}"; mkdir $output_dir || die; -my $json = JSON->new; # ->pretty; +my $json = JSON->new; # output batch info JSON my $batch_info = [ - { - "batch-type" => $batch_type, - "batch-name" => $batch_name, - "document-type" => "batch-meta", - "batch-timestamp" => $batch_timestamp, - "batch-id" => $batch_id, - "id" => $batch_id, - } - ]; + { + "batch-type" => $batch_type, + "batch-name" => $batch_name, + "document-type" => "batch-meta", + "batch-timestamp" => $batch_timestamp, + "batch-id" => $batch_id, + "id" => $batch_id, + } +]; if (open(my $batch_info_fh, ">$output_dir/batch.json")) { - print $batch_info_fh $json->encode($batch_info); - close($batch_info_fh); + print $batch_info_fh $json->encode($batch_info); + close($batch_info_fh); } else { - die "couldn't write $output_dir/batch.json\n"; + die "couldn't write $output_dir/batch.json\n"; } +my $csv = Text::CSV_XS->new({ sep_char => "\t", allow_loose_quotes => 1, binary => 1, auto_diag => 1 }); -my $csv = Text::CSV_XS->new ({ binary => 1, auto_diag => 1 }); -open my $fh, "<:encoding(utf8)", $file or die "$file: $!"; +# Load studies.txt into a hash +open my $studies_fh, "<:encoding(utf8)", $studies_file or die "$studies_file: $!"; +my $studies_headers = $csv->getline($studies_fh); +my %studies; +while (my $row = $csv->getline($studies_fh)) { + my %data; + @data{@$studies_headers} = @$row; + $studies{$data{"Study_ID"}} = \%data; +} +close($studies_fh); + +# Load collection_sites.txt into a hash +open my $collection_sites_fh, "<:encoding(utf8)", $collection_sites_file or die "$collection_sites_file: $!"; +my $collection_sites_headers = $csv->getline($collection_sites_fh); +my %collection_sites; +while (my $row = $csv->getline($collection_sites_fh)) { + my %data; + @data{@$collection_sites_headers} = @$row; + $collection_sites{$data{"Collection_site_ID"}} = \%data; +} +close($collection_sites_fh); + +# Load collections.txt into a hash +open my $collections_fh, "<:encoding(utf8)", $collections_file or die "$collections_file: $!"; +my $collections_headers = $csv->getline($collections_fh); +my %collections; +while (my $row = $csv->getline($collections_fh)) { + my %data; + @data{@$collections_headers} = @$row; + $collections{$data{"Collection_ID"}} = \%data; +} +close($collections_fh); +# Open samples.txt and process it +open my $fh, "<:encoding(utf8)", $samples_file or die "$samples_file: $!"; my $headers = $csv->getline($fh); # header to index my %h2i; -for (my $i=0; $i<@$headers; $i++) { $h2i{$headers->[$i]}=$i } - - -my %SolrField2function = - ( - # Solr_Field_name => function that takes a $row object - - "id" => sub { $document_type."_".$_[0]->[$h2i{"Sample ID"}] }, - "document-type" => sub { $document_type }, - "project" => sub { 'VectorBase' }, - "primaryKey" => sub { $_[0]->[$h2i{"Sample ID"}] }, -# "hyperlinkName" => sub { sprintf "See %s in MapVEu", $_[0]->[$h2i{"Sample ID"}] }, - - "batch-id" => sub { $batch_id }, - "batch-type" => sub { $batch_type }, - "batch-timestamp" => sub { $batch_timestamp }, - "batch-name" => sub { $batch_name }, - - "TEXT__popbio_species" => sub { $_[0]->[$h2i{"Species"}] }, - "TEXT__popbio_sample_id" => sub { $_[0]->[$h2i{"Sample ID"}] }, - "TEXT__popbio_sample_name" => sub { $_[0]->[$h2i{"Label"}] }, - "TEXT__popbio_collection_id" => sub { $_[0]->[$h2i{"Collection ID"}] }, - - ### the place name is not particularly useful because in the map CSV export only the GADM-assigned ADM2 name is - ### available and it's not usually a placename that anyone would search for (like the country, adm1 or perhaps - ### the actual location (e.g. village) collected in) - ### also, this CSV field is not actually multi-valued, despite the pluralised name. - "TEXT__popbio_collection_location" => sub { $_[0]->[$h2i{"Locations"}] }, - - "TEXT__popbio_computed_description" => sub { sprintf "Sample %s collected from %s on %s", - $_[0]->[$h2i{"Label"}], - $_[0]->[$h2i{"Locations"}], - $_[0]->[$h2i{"Collection date range"}] - }, - - "MULTITEXT__popbio_collection_protocols" => sub { [ split /,/, $_[0]->[$h2i{"Collection protocols"}] ] }, - - ### project IDs should end up being single valued after maybe more curation - ### but for now multi-valued is safer - "MULTITEXT__popbio_project_ids" => sub { [ split /,/, $_[0]->[$h2i{"Projects"}] ] }, - - "MULTITEXT__popbio_citations" => sub { [ split /,/, $_[0]->[$h2i{"Citations"}] ] }, - "MULTITEXT__popbio_tags" => sub { [ split /,/, $_[0]->[$h2i{"Tag"}] ] }, - - ### not indexing because more suited to advanced/strategy search or in-map filtering: - ### sex, dev stage, attractants, sample type, available data types - ); - +for (my $i = 0; $i < @$headers; $i++) { + $h2i{$headers->[$i]} = $i; +} -open(my $samples_fh, ">$output_dir/$batch_type.json") || die "can't open output json"; +my %SolrField2function = ( + "id" => sub { $document_type . "_" . $_[0]->[$h2i{"Sample_ID"}] }, + "document-type" => sub { $document_type }, + "project" => sub { 'VectorBase' }, + "primaryKey" => sub { $_[0]->[$h2i{"Sample_ID"}] }, + + "batch-id" => sub { $batch_id }, + "batch-type" => sub { $batch_type }, + "batch-timestamp" => sub { $batch_timestamp }, + "batch-name" => sub { $batch_name }, + + "TEXT__popbio_species" => sub { $_[0]->[$h2i{"species [OBI_0001909]"}] || '' }, + "TEXT__popbio_sample_id" => sub { $_[0]->[$h2i{"Sample_ID"}] || '' }, + "TEXT__popbio_sample_name" => sub { $_[0]->[$h2i{"Sample_ID"}] || '' }, + "TEXT__popbio_collection_id" => sub { $_[0]->[$h2i{"Collection_ID"}] || '' }, + + "TEXT__popbio_collection_location" => sub { + my $collection_site = $collection_sites{$_[0]->[$h2i{"Collection_site_ID"}]}; + return $collection_site ? (grep { $_ } @$collection_site{("provider name for collection site [EUPATH_0000542]", "town [POPBIO_8000015]", "Administrative region, level 2 [ENVO_00000006]", "Administrative region, level 1 [ENVO_00000005]", "country [OBI_0001627]", "continent [GAZ_00000013]")})[0] : ''; + }, + + "TEXT__popbio_computed_description" => sub { + my $collection = $collections{$_[0]->[$h2i{"Collection_ID"}]}; + my $collection_site = $collection_sites{$_[0]->[$h2i{"Collection_site_ID"}]}; + return sprintf "Sample %s collected from %s on %s", + $_[0]->[$h2i{"Sample_ID"}] || '', + $collection_site ? (grep { $_ } @$collection_site{("provider name for collection site [EUPATH_0000542]", "town [POPBIO_8000015]", "Administrative region, level 2 [ENVO_00000006]", "Administrative region, level 1 [ENVO_00000005]", "country [OBI_0001627]", "continent [GAZ_00000013]")})[0] || '' : '', + $collection ? ($collection->{"specimen collection date(s) (raw) [OBI_0001619]"} || '') : ''; + }, + + "MULTITEXT__popbio_collection_protocols" => sub { + my $collection = $collections{$_[0]->[$h2i{"Collection_ID"}]}; + return $collection && $collection->{"protocol [OBI_0000272]"} ? decode_json($collection->{"protocol [OBI_0000272]"}) : []; + }, + + "MULTITEXT__popbio_project_ids" => sub { + my $study = $studies{$_[0]->[$h2i{"Study_ID"}]}; + return $study && $study->{"PopBio Study ID [POPBIO_8000215]"} ? [$study->{"PopBio Study ID [POPBIO_8000215]"}] : []; + }, + + "MULTITEXT__popbio_citations" => sub { + my $study = $studies{$_[0]->[$h2i{"Study_ID"}]}; + return $study ? [grep { $_ } @{$study}{"PubMed ID [OBI_0001617]", "DOI [OBI_0002110]"}] : []; + }, + + "MULTITEXT__popbio_tags" => sub { + my $study = $studies{$_[0]->[$h2i{"Study_ID"}]}; + return $study ? [grep { $_ } ($study->{"Tags [POPBIO_8000214]"}, + $study->{"Institution [POPBIO_8000185]"} ? @{from_json($study->{"Institution [POPBIO_8000185]"}, { utf8 => 0 })} : undef + )] : []; + } +); + +open(my $samples_fh, ">:encoding(utf8)", "$output_dir/$batch_type.json") || die "can't open output json"; my $count; print $samples_fh "[\n"; while (my $row = $csv->getline($fh)) { - my $doc = { }; - foreach my $SolrField (keys %SolrField2function) { - $doc->{$SolrField} = $SolrField2function{$SolrField}($row); - } - print $samples_fh ",\n" if ($count++); - print $samples_fh $json->encode($doc); - # printing each document at a time, to prevent unnecessary memory use + my $doc = {}; + foreach my $SolrField (keys %SolrField2function) { + $doc->{$SolrField} = $SolrField2function{$SolrField}($row); + } + print $samples_fh ",\n" if ($count++); + print $samples_fh $json->encode($doc); } print $samples_fh "]\n"; - close($samples_fh); +close($fh); From 60b176f0de2b3bba59d36df2bc4c2b5871a1fa4c Mon Sep 17 00:00:00 2001 From: Bob Date: Thu, 4 Jul 2024 12:57:43 +0100 Subject: [PATCH 8/8] chunked/batched output JSON because it's too big now --- Model/bin/interimMapVEuCSVtoSolr | 35 ++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/Model/bin/interimMapVEuCSVtoSolr b/Model/bin/interimMapVEuCSVtoSolr index 52b3669..f65c259 100755 --- a/Model/bin/interimMapVEuCSVtoSolr +++ b/Model/bin/interimMapVEuCSVtoSolr @@ -41,6 +41,8 @@ my $batch_timestamp = time(); my $batch_id = sprintf "%s_%s_%d", $batch_type, $batch_name, $batch_timestamp; my $document_type = "popbio-sample"; +my $chunk_size = 500000; + my $output_dir = "solr-json-batch_${batch_id}"; mkdir $output_dir || die; @@ -101,8 +103,8 @@ while (my $row = $csv->getline($collections_fh)) { close($collections_fh); # Open samples.txt and process it -open my $fh, "<:encoding(utf8)", $samples_file or die "$samples_file: $!"; -my $headers = $csv->getline($fh); +open my $samples_fh, "<:encoding(utf8)", $samples_file or die "$samples_file: $!"; +my $headers = $csv->getline($samples_fh); # header to index my %h2i; @@ -163,19 +165,34 @@ my %SolrField2function = ( } ); -open(my $samples_fh, ">:encoding(utf8)", "$output_dir/$batch_type.json") || die "can't open output json"; +my $chunk_number = 1; +my $json_output_fh; my $count; -print $samples_fh "[\n"; -while (my $row = $csv->getline($fh)) { +while (my $row = $csv->getline($samples_fh)) { my $doc = {}; foreach my $SolrField (keys %SolrField2function) { $doc->{$SolrField} = $SolrField2function{$SolrField}($row); } - print $samples_fh ",\n" if ($count++); - print $samples_fh $json->encode($doc); + + # if not yet opened or if filehandle closed at the end of a chunk, open a new file + if (!defined $json_output_fh || !defined(fileno($json_output_fh))) { + open($json_output_fh, ">:encoding(utf8)", "$output_dir/$batch_type.$chunk_number.json") || die "can't open output json"; + print $json_output_fh "[\n"; + $count = 0; + } + + print $json_output_fh ",\n" if ($count++); + print $json_output_fh $json->encode($doc); + + if ($count == $chunk_size) { + print $json_output_fh "\n]\n"; + close($json_output_fh); + $chunk_number++; + } } -print $samples_fh "]\n"; +print $json_output_fh "\n]\n"; +close($json_output_fh); close($samples_fh); -close($fh); +