concordia-library/utf8case/generate_case_tables.pl

252 lines
6.4 KiB
Perl
Raw Normal View History

#!/usr/bin/perl
# Based on ideas from re2 library.
use strict;
use LWP::Simple;
use String::Util qw(hascontent);
use Data::Dumper;
use Clone qw(clone);
my $UNIDATA_PREFIX= q{http://unicode.org/Public/UNIDATA/};
my $OUTPUT_CPP_FILE = 'case_tables.cpp';
my @lower_case_ranges;
my @upper_case_ranges;
my @title_case_ranges;
my @lower_special_casing;
my @upper_special_casing;
my @title_special_casing;
open my $output_cpp_fh, '>', $OUTPUT_CPP_FILE;
generate_intro();
generate_standard_case_tables();
generate_special_casing_tables();
sub generate_intro {
print $output_cpp_fh <<'END_OF_INTRO';
// GENERATED AUTOMATICALLY BY generate_case_tables.pl; DO NOT EDIT.
#include "case_tables.hpp"
END_OF_INTRO
}
sub generate_standard_case_tables {
my @unicode_data_lines = download_unidata_file('UnicodeData.txt');
for my $line (@unicode_data_lines) {
append_to_case_ranges(\@upper_case_ranges, $line->[0], $line->[12]);
append_to_case_ranges(\@lower_case_ranges, $line->[0], $line->[13]);
append_to_case_ranges(\@title_case_ranges, $line->[0], $line->[14]);
}
@lower_case_ranges = compactify(\@lower_case_ranges);
@upper_case_ranges = compactify(\@upper_case_ranges);
@title_case_ranges = compactify(\@title_case_ranges);
write_case_table('lower_case_ranges', \@lower_case_ranges);
print "\n";
write_case_table('upper_case_ranges', \@upper_case_ranges);
print "\n";
write_case_table('title_case_ranges', \@title_case_ranges);
print "\n";
}
sub generate_special_casing_tables {
my @special_casing_lines = download_unidata_file('SpecialCasing.txt');
for my $line (@special_casing_lines) {
if (hascontent($line->[4])) {
print STDERR "This cannot be handled: ", join('; ', @{$line}),"\n";
} else {
append_to_special_casing_table(\@lower_special_casing, $line->[0], $line->[1]);
append_to_special_casing_table(\@title_special_casing, $line->[0], $line->[2]);
append_to_special_casing_table(\@upper_special_casing, $line->[0], $line->[3]);
}
}
write_special_casing_table('lower_special_casing', \@lower_special_casing);
print "\n";
write_special_casing_table('title_special_casing', \@title_special_casing);
print "\n";
write_special_casing_table('upper_special_casing', \@upper_special_casing);
}
sub download_unidata_file {
my ($file_name) = @_;
my $url = $UNIDATA_PREFIX . $file_name;
print STDERR "Downloading ${url}...\n";
my $contents = get($url);
return map { [ split/\s*;\s*/ ] }
grep { /\S/ }
map{ s/\#.*\Z//; $_}
split/\r?\n/, $contents;
}
sub append_to_case_ranges {
my ($case_ranges_ref, $hex_code_point, $hex_modified_code_point) = @_;
if (!hascontent($hex_modified_code_point)) {
return;
}
my $code_point = hex($hex_code_point);
my $modified_code_point = hex($hex_modified_code_point);
push @{$case_ranges_ref},
[ $code_point, $code_point, delta($code_point, $modified_code_point) ];
}
sub compactify {
my ($case_ranges_ref) = @_;
my @new_table;
my $current_compact_range;
for my $range (@{$case_ranges_ref}) {
if (!defined($current_compact_range)) {
$current_compact_range = clone($range);
} elsif ($range->[2] eq $current_compact_range->[2]
&& $range->[0] == $current_compact_range->[1] + 1) {
++$current_compact_range->[1];
} elsif ($range->[2] eq de_skip($current_compact_range->[2])
&& $range->[0] == $current_compact_range->[1] + 2) {
$current_compact_range->[1] += 2;
$current_compact_range->[2] = add_skip($current_compact_range->[2]);
} else {
push @new_table, $current_compact_range;
$current_compact_range = clone($range);
}
}
push @new_table, $current_compact_range;
return @new_table;
}
sub write_case_table {
my ($name, $case_ranges_ref) = @_;
my $table_name = uc($name);
my $size_constant_name = $table_name . "_SIZE";
my $table_size = $#{$case_ranges_ref} + 1;
print $output_cpp_fh <<"END_OF_INTRO";
const size_t $size_constant_name = $table_size;
const CaseConversionRecord ${table_name}[$size_constant_name] = {
END_OF_INTRO
my $string_to_prepend = '';
for my $range (@{$case_ranges_ref}) {
my $from = $range->[0];
my $to = $range->[1];
my $delta = $range->[2];
print $output_cpp_fh "${string_to_prepend} {$from, $to, $delta}";
$string_to_prepend = ",\n";
}
print $output_cpp_fh "\n};\n";
}
sub append_to_special_casing_table {
my ($special_casing_table_ref, $hex_code_point, $hex_code_point_vector) = @_;
if (!hascontent($hex_code_point_vector)) {
return;
}
my $code_point = hex($hex_code_point);
my @code_point_vector = map { hex($_) } split/\s+/, $hex_code_point_vector;
push $special_casing_table_ref, [$code_point, cpp_encode(@code_point_vector)];
}
sub write_special_casing_table {
my ($name, $special_casing_table_ref) = @_;
my $table_name = uc($name);
my $size_constant_name = $table_name . "_SIZE";
my $table_size = $#{$special_casing_table_ref} + 1;
print $output_cpp_fh <<"END_OF_INTRO";
const size_t $size_constant_name = $table_size;
const SpecialCasingConversionRecord ${table_name}[$size_constant_name] = {
END_OF_INTRO
my $string_to_prepend = '';
for my $item (@{$special_casing_table_ref}) {
my $code_point = $item->[0];
my $replacement = $item->[1];
print $output_cpp_fh "${string_to_prepend} {$code_point, \"$replacement\"}";
$string_to_prepend = ",\n";
}
print $output_cpp_fh "\n};\n";
}
sub cpp_encode {
my (@v) = @_;
my $s = join('', map{ chr($_) } @v);
return join('', map { "\\x$_" } unpack("U0(H2)*", $s));
}
sub de_skip {
my ($delta) = @_;
if ($delta =~ /^(EVEN_ODD|ODD_EVEN)(?:_SKIP)?$/) {
return $1;
}
return 'CANNOT_BE_SKIPPED';
}
sub add_skip {
my ($delta) = @_;
return de_skip($delta) . '_SKIP';
}
sub delta {
my ($a, $b) = @_;
if ($a + 1 == $b) {
if ($a % 2 == 0) {
return 'EVEN_ODD'
}
else {
return 'ODD_EVEN';
}
} elsif ($a == $b + 1) {
if ($a % 2 == 0) {
return 'ODD_EVEN';
}
else {
return 'EVEN_ODD';
}
}
return $b - $a;
}