concordia-server/mgiza-aligner/mgiza/mgizapp/scripts/snt2cooc.pl

66 lines
1.3 KiB
Perl
Raw Normal View History

2017-01-21 17:07:36 +01:00
#!/usr/bin/perl -w
# Usage: snt2cooc.pl [-sort-buffer-size 200M] [-sort-batch-size 253] [-sort-compress gzip] output vcb1 vcb2 snt12
#
# This file is part of mgiza++. Its use is licensed under the GNU General
# Public License version 2 or, at your option, any later version.
use strict;
use File::Basename;
use FindBin qw($Bin);
sub systemCheck($);
# Translate this script's optional "-sort-*" arguments into the matching
# options of the external sort program.
#
# Arguments: the optional arguments only (everything that precedes the four
#            trailing positional arguments).
# Returns:   a string of sort options, each preceded by a space; the empty
#            string when no recognised option was given.
# Dies when a recognised option is not followed by a value.  Unrecognised
# arguments are skipped, with a warning on STDERR.
sub _buildSortArgs
{
    my @opts = @_;

    # Script option => option name passed on to sort.
    my %sortOptionFor = (
        "-sort-buffer-size" => "-S",
        "-sort-batch-size"  => "--batch-size",
        "-sort-compress"    => "--compress-program",
    );

    my $args = "";
    for (my $i = 0; $i < @opts; ++$i)
    {
        my $arg = $opts[$i];
        if (!exists $sortOptionFor{$arg})
        {
            print STDERR "Ignoring unrecognised argument '$arg'\n";
            next;
        }

        # The value is the next optional argument.  It must not be taken
        # from the positional arguments, which are not part of @opts.
        my $value = $opts[++$i];
        die "Option $arg requires a value\n" unless defined $value;

        $args .= " " . $sortOptionFor{$arg} . " " . $value;
    }
    return $args;
}

# Everything before the last four (positional) arguments is an option.
# With four or fewer arguments the slice is empty.
my $sortArgs = _buildSortArgs(@ARGV[0 .. $#ARGV - 4]);
# Quote a string so that /bin/sh passes it on as exactly one word, even when
# it contains spaces or shell metacharacters.
sub shellQuote
{
    my ($text) = @_;
    $text =~ s/'/'\\''/g;    # close the quote, emit an escaped ', reopen
    return "'" . $text . "'";
}

# The four trailing arguments are mandatory; without this check the negative
# indices below would wrap around or yield undef.
if (@ARGV < 4)
{
    print STDERR "Usage: $0 [-sort-buffer-size SIZE] [-sort-batch-size N] "
        . "[-sort-compress PROG] output vcb1 vcb2 snt12\n";
    exit(1);
}

# Positional arguments, counted from the end of the argument list.
my $out = $ARGV[-4];
my $vcb1 = $ARGV[-3];
my $vcb2 = $ARGV[-2];
my $snt12 = $ARGV[-1];

# Use "gsort" when running it produces help output, otherwise plain "sort".
# NOTE(review): presumably gsort is GNU sort on systems whose default sort
# lacks the GNU options used here -- confirm.
my $SORT_EXEC = `gsort --help 2>/dev/null` ? 'gsort' : 'sort';

# sort keeps its temporary files next to the output file.
my $TMPDIR = dirname($out);

# Pipeline: snt2coocrmp (located next to this script) | numeric sort on the
# first two fields | uniq > output.  $sortArgs is an already assembled option
# string and is therefore interpolated as is; every path is quoted.
# NOTE(review): the shell reports the status of the last pipeline stage, so a
# failure of snt2coocrmp itself is not detected by systemCheck.
my $cmd;
$cmd = join(" ", shellQuote("$Bin/snt2coocrmp"),
    shellQuote($vcb1), shellQuote($vcb2), shellQuote($snt12)) . " ";
$cmd .= "| $SORT_EXEC $sortArgs -T " . shellQuote($TMPDIR)
    . " -nk 1 -nk 2 | uniq > " . shellQuote($out);
systemCheck($cmd);
#############################
# Run a command through system(), echoing it to STDERR first.
#
# Argument: the complete command line as a single string.
# Returns normally when system() reports status 0.  Otherwise the reason for
# the failure is written to STDERR and this script terminates with exit
# status 1.
#
# The ($) prototype is kept so the definition matches the forward declaration
# at the top of the file.
sub systemCheck($)
{
    my $cmd = shift;
    print STDERR "Executing $cmd \n";
    my $retVal = system($cmd);
    if ($retVal != 0)
    {
        if ($retVal == -1)
        {
            # system() returns -1 when the command could not be started.
            print STDERR "Failed to execute command: $!\n";
        }
        elsif ($retVal & 127)
        {
            # The low seven bits of the wait status hold the signal number.
            print STDERR "Command died with signal " . ($retVal & 127) . "\n";
        }
        else
        {
            # The exit code is in the high byte of the wait status.
            print STDERR "Command exited with status " . ($retVal >> 8) . "\n";
        }
        exit(1);
    }
}