From 5337f9b1e4a8f2074ac6d6a57a463e95a1ec8aee Mon Sep 17 00:00:00 2001 From: Jelmer van der Linde Date: Mon, 14 Jun 2021 17:09:20 +0100 Subject: [PATCH] Change split_single_document to work on STDIN & STDOUT In b64 mode it still buffers the output and encodes it in one go. --- moses/ems/support/split-sentences.perl | 27 +++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/moses/ems/support/split-sentences.perl b/moses/ems/support/split-sentences.perl index ae510ce..bae7953 100755 --- a/moses/ems/support/split-sentences.perl +++ b/moses/ems/support/split-sentences.perl @@ -96,12 +96,16 @@ if ($MODE eq "base64documents") { while () { my $line = decode_base64($_); - open(my $fh, ":encoding(UTF-8)", \$out) or die $!; + &split_single_document; + } + print(encode_base64($out, "") . "\n"); } } else { - print &split_single_document(*STDIN); + &split_single_document; } @@ -109,18 +113,16 @@ sub split_single_document { # Argument is an open file handle. Lines will be merged unless a line with # just

<P> or similar tag or a blank line. Or unless $KEEP_LINES # is True. - my ($fh) = @_; my $text = ""; - my $out = ""; # Loop over text, add lines together until we get a blank line or a <p>
- while (<$fh>) { + while (<STDIN>) { chomp; if ($KEEP_LINES) { - $out .= &split_block($_,""); + print &split_block($_,""); } elsif (/^<.+>$/ || /^\s*$/) { # Time to process this block; we've hit a blank or <p>
- $out .= &split_block($text, $_); - $out .= "<P>\n" if $NOP == 0 && (/^\s*$/ && $text); ## If we have text followed by <P>
+ print &split_block($text, $_); + print "<P>\n" if $NOP == 0 && (/^\s*$/ && $text); ## If we have text followed by <P>
$text = ""; } else { # Append the text, with a space. @@ -128,8 +130,7 @@ sub split_single_document { } } # Do the leftover text. - $out .= &split_block($text,"") if $text; - return $out; + print &split_block($text,"") if $text; } sub split_block { @@ -268,7 +269,7 @@ sub preprocess { # We stopped one token from the end to allow for easy look-ahead. # Append it now. - $text = $text.$words[$i]; + $text = $text.$words[$i] if scalar(@words) > 0; # Clean up spaces at head and tail of each line as well as any double-spacing $text =~ s/ +/ /g;