214 lines
3.6 KiB
Perl
214 lines
3.6 KiB
Perl
|
#! /usr/bin/perl
|
|||
|
|
|||
|
use locale;
|
|||
|
|
|||
|
# Number of dictionary lines after which the splitting step further down
# switches to a new temporary chunk file.
$linesPerFile = 20000;

# The dictionary file is mandatory. On misuse, report on STDERR and exit
# with a non-zero status so that calling scripts can detect the failure.
if (@ARGV < 1) {
    print STDERR "usage: prep.pl dictionary_file [pre|suf]\n";
    exit 1;
}

# First argument: path of the dictionary file to process.
$file = shift @ARGV;

# Optional second argument: the kind of analysis. "pre" makes the statistics
# step run stat_pre.pl; any other value makes it run stat.pl. Defaults to
# "suf" when the argument is missing or empty.
$kind = shift @ARGV;
if (!defined($kind) || $kind eq "") {
    $kind = "suf";
}
|
|||
|
|
|||
|
# Prepare the labels: store the output of makeLabels.pl in labels.sym, then
# run lexmakelab on the "labels" base name.
# NOTE(review): lexmakelab presumably produces the labels.lab / labels.scl
# files used by later steps -- confirm against the tool's documentation.
foreach my $labelCommand ("makeLabels.pl > labels.sym", "lexmakelab labels") {
    `$labelCommand`;
}
|
|||
|
|
|||
|
# Analyse the dictionary file.

# Quote the dictionary path for the shell (wrap it in single quotes and
# escape embedded single quotes) so that a name containing spaces or shell
# metacharacters reaches canon.pl intact instead of being interpreted.
my $quotedFile = $file;
$quotedFile =~ s/'/'\\''/g;
$quotedFile = "'" . $quotedFile . "'";

# Step 1: canonicalise the descriptions; the result is written to temp2.
print "Kanonizuj<75> opisy.........................................";
`canon.pl < $quotedFile > temp2`;
print "OK\n";

# Step 2: prefix analysis. count_prefs.pl output is sorted by field 1 and
# then by field 3 (numeric, descending) before cut_prefs.pl writes the final
# "prefs" file; the sorted intermediate file is removed afterwards.
print "Analizuje prefiksy.......................................";
`count_prefs.pl 2 4 < temp2 > prefs`;
`sort -k1,1 -k3,3nr prefs > prefsS`;
`cut_prefs.pl 0.5 0.01 100 prefsS > prefs`;
unlink("prefsS");
print "OK\n";

# Step 3: dictionary statistics, written to temp1. The tool depends on the
# requested kind: stat_pre.pl for "pre", otherwise stat.pl with the prefix
# list produced above.
print "Analizuj<75> plik s<>ownika";
if ($kind eq "pre") {
    print "(pre).............................";
    `stat_pre.pl temp2 > temp1`;
}
else {
    print "(suf).............................";
    `stat.pl prefs < temp2 > temp1`;
}
print "OK\n";
|
|||
|
|
|||
|
# Shrink the file.

# Sort temp1 into temp2 using "~" as the field separator: first by field 1,
# then by field 2 in descending numeric order.
print "Sortuj<75> plik.............................................";
qx{sort -t \\~ -k1,1 -k2,2nr <temp1 > temp2};
print "OK\n";

# Run rmDup.pl over the sorted file and move its result back into temp2.
# A second pass (rmDup2.pl < temp1 > temp2) is disabled, so the output of
# rmDup.pl is simply copied over temp2 and temp1 is removed.
print "Minimalizuj<75> plik s<>ownika...............................";
foreach my $step ("rmDup.pl < temp2 > temp1", "cp temp1 temp2", "rm temp1") {
    qx{$step};
}
print "OK\n";
|
|||
|
|
|||
|
# Split the file into many parts; lexcomplex is then run on each part
# separately and the results are joined with the fsmunion program.

# Open chunk file LemTEMP/slo_<index> for writing and return its handle.
# Dies with the system error when the file cannot be created.
sub _openChunk {
    my ($index) = @_;
    my $path = "LemTEMP/slo_" . $index;
    open(my $handle, '>', $path)
        or die "\nprep.pl: cannot create $path: $!\n";
    return $handle;
}

print "Dziel<65> s<>ownik na mniejsze cz<63><7A>ci........................";

# Fail loudly if the input is missing; continuing would silently build an
# automaton from an empty dictionary.
open(my $in, '<', "./temp2")
    or die "\nprep.pl: cannot open ./temp2: $!\n";

$lineCount = 0;
$fileCount = 0;    # index of the chunk currently being written; read by later steps

# Create the work directory; one left over from an earlier run is reused.
if (!-d "LemTEMP") {
    mkdir("LemTEMP", 0777)
        or die "\nprep.pl: cannot create directory LemTEMP: $!\n";
}

my $out = _openChunk($fileCount);

while (my $line = <$in>) {

    # Once the counter reaches the limit, start the next chunk; the line
    # that triggered the switch becomes the first line of the new chunk.
    if (++$lineCount >= $linesPerFile) {
        $fileCount++;
        $lineCount = 0;

        close($out)
            or die "\nprep.pl: error while writing a chunk file: $!\n";
        $out = _openChunk($fileCount);
    }

    print {$out} $line;
}

# Close the last chunk explicitly so that buffered write errors are reported
# here rather than lost, and release the input handle.
close($out)
    or die "\nprep.pl: error while writing LemTEMP/slo_$fileCount: $!\n";
close($in);

print "OK\n";
|
|||
|
|
|||
|
# Build one intermediate automaton per chunk: lexcomplex reads
# LemTEMP/slo_<n> and its output is stored in LemTEMP/slownik_<n>.fsm.
print "Tworz<72> automaty po<70>rednie";

# The progress bar is exactly 32 dots wide. Chunks are numbered
# 0..$fileCount, so there are $fileCount + 1 of them; the number of dots due
# is derived from the number of chunks finished, which keeps the total at 32
# for any chunk count (including a single chunk).
my $chunkTotal  = $fileCount + 1;
my $dotsPrinted = 0;

for my $chunk (0 .. $fileCount) {

    my $lexCommand = "lexcomplex -l labels.lab -S labels.scl < LemTEMP/slo_" . $chunk
                   . " > LemTEMP/slownik_" . $chunk . ".fsm";
    `$lexCommand`;

    # Emit however many dots became due with this chunk (possibly none).
    my $dotsDue = int(32 * ($chunk + 1) / $chunkTotal);
    print "." x ($dotsDue - $dotsPrinted);
    $dotsPrinted = $dotsDue;
}

print "OK\n";

# The text chunks are no longer needed once every automaton has been built.
`rm LemTEMP/slo_*`;
|
|||
|
|
|||
|
# Build the final automaton: the union of all intermediate automata
# LemTEMP/slownik_0.fsm .. LemTEMP/slownik_<$fileCount>.fsm, written to
# slownik1.fsm.
print "Tworz<72> automat ko<6B>cowy";

# The union is a single command, so the 35-dot progress bar is printed in
# one go before it starts.
print "." x 35;

# All parts are passed to one fsmunion call. A pairwise union loop that used
# to precede this call never executed (its condition tested the misspelled,
# always-undefined $filecount, and its body named the parts without the
# ".fsm" suffix), and its result would have been overwritten by this call
# anyway, so it has been removed together with the preparatory copy of
# slownik_0.fsm. Listing the parts explicitly instead of globbing LemTEMP/*
# keeps stray files in the work directory out of the union.
# NOTE(review): relies on fsmunion accepting any number of input files, as
# the previous "fsmunion LemTEMP/*" invocation already did.
my $unionParts = join(" ", map { "LemTEMP/slownik_" . $_ . ".fsm" } 0 .. $fileCount);
`fsmunion $unionParts > slownik1.fsm`;

print "OK\n";
|
|||
|
|
|||
|
# Post-process the united automaton. Every stage is a progress label
# followed by the shell commands to run, in order; "OK" is reported once the
# stage's commands have been issued. Intermediate files are removed as soon
# as the next file in the chain has been written.
my @stages = (
    [
        "Usuwam epsilon-przejscia.................................",
        "fsmrmepsilon slownik1.fsm > slownik2.fsm",
        "rm slownik1.fsm",
    ],
    [
        "Determinizuj<75> automat....................................",
        "fsmdeterminize slownik2.fsm > slownik1.fsm",
        "rm slownik2.fsm",
    ],
    [
        "Minimalizuj<75> automat.....................................",
        "fsmminimize slownik1.fsm > slownik.fsm",
        "rm slownik1.fsm",
    ],
    [
        # Conversion chain: slownik.fsm -> slownik.txt -> slownik.aut -> gue.bin
        "Konwertuj<75> automat do formatu fsa........................",
        "fsmprint -i labels.lab slownik.fsm > slownik.txt",
        "../fsm2aut slownik.txt > slownik.aut",
        "../aut2fsa < slownik.aut > gue.bin",
    ],
);

foreach my $stage (@stages) {
    my ($label, @stageCommands) = @$stage;

    print $label;
    foreach my $stageCommand (@stageCommands) {
        `$stageCommand`;
    }
    print "OK\n";
}
|
|||
|
|
|||
|
# Remove the helper files: the contents of the LemTEMP work directory and
# the directory itself, then the remaining intermediate files in the current
# directory. The commands run in the listed order.
print "Czyszcz<63> pliki pomocnicze................................";

foreach my $cleanupCommand (
    "rm LemTEMP/*",
    "rmdir LemTEMP",
    "rm temp2",
    "rm slownik.fsm",
    "rm slownik.txt",
    "rm slownik.aut",
) {
    `$cleanupCommand`;
}

print "OK\n";
|