252 lines
6.4 KiB
Perl
252 lines
6.4 KiB
Perl
|
#!/usr/bin/perl
|
||
|
|
||
|
# Based on ideas from re2 library.
|
||
|
|
||
|
use strict;
|
||
|
use LWP::Simple;
|
||
|
use String::Util qw(hascontent);
|
||
|
use Data::Dumper;
|
||
|
use Clone qw(clone);
|
||
|
|
||
|
my $UNIDATA_PREFIX= q{http://unicode.org/Public/UNIDATA/};
|
||
|
my $OUTPUT_CPP_FILE = 'case_tables.cpp';
|
||
|
|
||
|
my @lower_case_ranges;
|
||
|
my @upper_case_ranges;
|
||
|
my @title_case_ranges;
|
||
|
|
||
|
my @lower_special_casing;
|
||
|
my @upper_special_casing;
|
||
|
my @title_special_casing;
|
||
|
|
||
|
open my $output_cpp_fh, '>', $OUTPUT_CPP_FILE;
|
||
|
generate_intro();
|
||
|
generate_standard_case_tables();
|
||
|
generate_special_casing_tables();
|
||
|
|
||
|
sub generate_intro {
|
||
|
print $output_cpp_fh <<'END_OF_INTRO';
|
||
|
// GENERATED AUTOMATICALLY BY generate_case_tables.pl; DO NOT EDIT.
|
||
|
|
||
|
#include "case_tables.hpp"
|
||
|
|
||
|
END_OF_INTRO
|
||
|
}
|
||
|
|
||
|
sub generate_standard_case_tables {
|
||
|
my @unicode_data_lines = download_unidata_file('UnicodeData.txt');
|
||
|
|
||
|
for my $line (@unicode_data_lines) {
|
||
|
append_to_case_ranges(\@upper_case_ranges, $line->[0], $line->[12]);
|
||
|
append_to_case_ranges(\@lower_case_ranges, $line->[0], $line->[13]);
|
||
|
append_to_case_ranges(\@title_case_ranges, $line->[0], $line->[14]);
|
||
|
}
|
||
|
|
||
|
@lower_case_ranges = compactify(\@lower_case_ranges);
|
||
|
@upper_case_ranges = compactify(\@upper_case_ranges);
|
||
|
@title_case_ranges = compactify(\@title_case_ranges);
|
||
|
|
||
|
write_case_table('lower_case_ranges', \@lower_case_ranges);
|
||
|
print "\n";
|
||
|
|
||
|
write_case_table('upper_case_ranges', \@upper_case_ranges);
|
||
|
print "\n";
|
||
|
|
||
|
write_case_table('title_case_ranges', \@title_case_ranges);
|
||
|
print "\n";
|
||
|
}
|
||
|
|
||
|
sub generate_special_casing_tables {
|
||
|
my @special_casing_lines = download_unidata_file('SpecialCasing.txt');
|
||
|
|
||
|
for my $line (@special_casing_lines) {
|
||
|
if (hascontent($line->[4])) {
|
||
|
print STDERR "This cannot be handled: ", join('; ', @{$line}),"\n";
|
||
|
} else {
|
||
|
append_to_special_casing_table(\@lower_special_casing, $line->[0], $line->[1]);
|
||
|
append_to_special_casing_table(\@title_special_casing, $line->[0], $line->[2]);
|
||
|
append_to_special_casing_table(\@upper_special_casing, $line->[0], $line->[3]);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
write_special_casing_table('lower_special_casing', \@lower_special_casing);
|
||
|
print "\n";
|
||
|
|
||
|
write_special_casing_table('title_special_casing', \@title_special_casing);
|
||
|
print "\n";
|
||
|
|
||
|
write_special_casing_table('upper_special_casing', \@upper_special_casing);
|
||
|
}
|
||
|
|
||
|
sub download_unidata_file {
|
||
|
my ($file_name) = @_;
|
||
|
|
||
|
my $url = $UNIDATA_PREFIX . $file_name;
|
||
|
|
||
|
print STDERR "Downloading ${url}...\n";
|
||
|
|
||
|
my $contents = get($url);
|
||
|
|
||
|
return map { [ split/\s*;\s*/ ] }
|
||
|
grep { /\S/ }
|
||
|
map{ s/\#.*\Z//; $_}
|
||
|
split/\r?\n/, $contents;
|
||
|
}
|
||
|
|
||
|
sub append_to_case_ranges {
|
||
|
my ($case_ranges_ref, $hex_code_point, $hex_modified_code_point) = @_;
|
||
|
|
||
|
if (!hascontent($hex_modified_code_point)) {
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
my $code_point = hex($hex_code_point);
|
||
|
my $modified_code_point = hex($hex_modified_code_point);
|
||
|
|
||
|
push @{$case_ranges_ref},
|
||
|
[ $code_point, $code_point, delta($code_point, $modified_code_point) ];
|
||
|
}
|
||
|
|
||
|
sub compactify {
|
||
|
my ($case_ranges_ref) = @_;
|
||
|
|
||
|
my @new_table;
|
||
|
|
||
|
my $current_compact_range;
|
||
|
|
||
|
for my $range (@{$case_ranges_ref}) {
|
||
|
if (!defined($current_compact_range)) {
|
||
|
$current_compact_range = clone($range);
|
||
|
} elsif ($range->[2] eq $current_compact_range->[2]
|
||
|
&& $range->[0] == $current_compact_range->[1] + 1) {
|
||
|
++$current_compact_range->[1];
|
||
|
} elsif ($range->[2] eq de_skip($current_compact_range->[2])
|
||
|
&& $range->[0] == $current_compact_range->[1] + 2) {
|
||
|
$current_compact_range->[1] += 2;
|
||
|
$current_compact_range->[2] = add_skip($current_compact_range->[2]);
|
||
|
} else {
|
||
|
push @new_table, $current_compact_range;
|
||
|
$current_compact_range = clone($range);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
push @new_table, $current_compact_range;
|
||
|
|
||
|
return @new_table;
|
||
|
}
|
||
|
|
||
|
sub write_case_table {
|
||
|
my ($name, $case_ranges_ref) = @_;
|
||
|
|
||
|
my $table_name = uc($name);
|
||
|
my $size_constant_name = $table_name . "_SIZE";
|
||
|
my $table_size = $#{$case_ranges_ref} + 1;
|
||
|
|
||
|
|
||
|
print $output_cpp_fh <<"END_OF_INTRO";
|
||
|
const size_t $size_constant_name = $table_size;
|
||
|
const CaseConversionRecord ${table_name}[$size_constant_name] = {
|
||
|
END_OF_INTRO
|
||
|
|
||
|
my $string_to_prepend = '';
|
||
|
|
||
|
for my $range (@{$case_ranges_ref}) {
|
||
|
my $from = $range->[0];
|
||
|
my $to = $range->[1];
|
||
|
my $delta = $range->[2];
|
||
|
|
||
|
print $output_cpp_fh "${string_to_prepend} {$from, $to, $delta}";
|
||
|
|
||
|
$string_to_prepend = ",\n";
|
||
|
}
|
||
|
|
||
|
|
||
|
print $output_cpp_fh "\n};\n";
|
||
|
}
|
||
|
|
||
|
sub append_to_special_casing_table {
|
||
|
my ($special_casing_table_ref, $hex_code_point, $hex_code_point_vector) = @_;
|
||
|
|
||
|
if (!hascontent($hex_code_point_vector)) {
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
my $code_point = hex($hex_code_point);
|
||
|
my @code_point_vector = map { hex($_) } split/\s+/, $hex_code_point_vector;
|
||
|
|
||
|
push $special_casing_table_ref, [$code_point, cpp_encode(@code_point_vector)];
|
||
|
}
|
||
|
|
||
|
sub write_special_casing_table {
|
||
|
my ($name, $special_casing_table_ref) = @_;
|
||
|
|
||
|
my $table_name = uc($name);
|
||
|
my $size_constant_name = $table_name . "_SIZE";
|
||
|
my $table_size = $#{$special_casing_table_ref} + 1;
|
||
|
|
||
|
print $output_cpp_fh <<"END_OF_INTRO";
|
||
|
const size_t $size_constant_name = $table_size;
|
||
|
const SpecialCasingConversionRecord ${table_name}[$size_constant_name] = {
|
||
|
END_OF_INTRO
|
||
|
|
||
|
my $string_to_prepend = '';
|
||
|
|
||
|
for my $item (@{$special_casing_table_ref}) {
|
||
|
my $code_point = $item->[0];
|
||
|
my $replacement = $item->[1];
|
||
|
|
||
|
print $output_cpp_fh "${string_to_prepend} {$code_point, \"$replacement\"}";
|
||
|
|
||
|
$string_to_prepend = ",\n";
|
||
|
}
|
||
|
|
||
|
print $output_cpp_fh "\n};\n";
|
||
|
}
|
||
|
|
||
|
sub cpp_encode {
|
||
|
my (@v) = @_;
|
||
|
|
||
|
my $s = join('', map{ chr($_) } @v);
|
||
|
|
||
|
return join('', map { "\\x$_" } unpack("U0(H2)*", $s));
|
||
|
}
|
||
|
|
||
|
sub de_skip {
|
||
|
my ($delta) = @_;
|
||
|
|
||
|
if ($delta =~ /^(EVEN_ODD|ODD_EVEN)(?:_SKIP)?$/) {
|
||
|
return $1;
|
||
|
}
|
||
|
|
||
|
return 'CANNOT_BE_SKIPPED';
|
||
|
}
|
||
|
|
||
|
sub add_skip {
|
||
|
my ($delta) = @_;
|
||
|
|
||
|
return de_skip($delta) . '_SKIP';
|
||
|
}
|
||
|
|
||
|
|
||
|
sub delta {
|
||
|
my ($a, $b) = @_;
|
||
|
|
||
|
if ($a + 1 == $b) {
|
||
|
if ($a % 2 == 0) {
|
||
|
return 'EVEN_ODD'
|
||
|
}
|
||
|
else {
|
||
|
return 'ODD_EVEN';
|
||
|
}
|
||
|
} elsif ($a == $b + 1) {
|
||
|
if ($a % 2 == 0) {
|
||
|
return 'ODD_EVEN';
|
||
|
}
|
||
|
else {
|
||
|
return 'EVEN_ODD';
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return $b - $a;
|
||
|
}
|